/**
 * Copyright (c) 2024 Huawei Technologies Co., Ltd.
 * This file is a part of the CANN Open Software.
 * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */

/*!
 * \file copy_cube_in_using_ub.h
 * \brief
 */

#ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_USING_UB_H
#define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_USING_UB_H

#include "../../matmul_module.h"
#include "../../matmul_param.h"
#include "copy_cube_in_intf.h"

namespace matmul {
using namespace AscendC;

// Layout/alignment constants shared by the ND->NZ transform, trans-data and
// anti-quant copy paths implemented below.
constexpr int32_t FIRST_16BIT_OFFSET_MM_API = 16;
constexpr int32_t SECOND_16BIT_OFFSET_MM_API = 32;
constexpr int32_t THIRD_16BIT_OFFSET_MM_API = 48;
constexpr int32_t REPEAT_BLOCK_NUM_MM_API = 8;
constexpr int32_t EACH_BLOCK_BYTES_MM_API = 32;
constexpr int32_t CACHE_LINE_SIZE_MM_API = 512;
constexpr int32_t TRANS_DATA_ARRAY_SIZE_MM_API = 16;
constexpr int32_t ANTI_QUANT_ALIGN_SIZE_MM_API = 32;
// Hardware DataCopy blockCount field is 12 bits wide, hence this cap.
// NOTE(review): 4095 == 2^12 - 1 — confirm against the DataCopyParams spec.
constexpr int32_t MAX_BLOCK_COUNT_SIZE_MM_API = 4095;

template <typename IMPL, class INPUT_TYPE, const auto& MM_CFG>
class CopyCubeIn<IMPL, INPUT_TYPE, MM_CFG,
    enable_if_t<MatmulFeatureTrait<MM_CFG>::IsNeedUB() &&
    ((GetCopyCubeInType<INPUT_TYPE, MM_CFG>() == CopyCubeInType::NORMAL) ||
    (GetCopyCubeInType<INPUT_TYPE, MM_CFG>() == CopyCubeInType::MDL))>>
{
    MATMUL_USE_MODULE_ON(CubeInBuffer, INPUT_TYPE::TAG);
    MATMUL_USE_MODULE_ON(CopyCubeInParams, INPUT_TYPE::TAG);
    MATMUL_USE_MODULE_ON(MatmulVar, INPUT_TYPE::TAG);
    MATMUL_USE_MODULE_ON(MatmulShapeInfo, INPUT_TYPE::TAG);
    MATMUL_USE_MODULE_ON(MatmulTensorInfo, INPUT_TYPE::TAG);
    MATMUL_USE_MODULE_ON(MatmulShapeTiling, INPUT_TYPE::TAG);
    MATMUL_USE_MODULE(MatmulAntiQuantProcessor);
    using TRANS_T = typename INPUT_TYPE::TRANS_T;
    using SRC_T = typename INPUT_TYPE::T;

    // Effective N dimension in use for the current tile.
    __aicore__ constexpr auto GetBaseUseN() const
    {
        return MATMUL_CONST_PARAM_VAR.baseUseN_;
    }
    
    // Effective Kb step size in use for the current tile.
    __aicore__ constexpr auto GetBaseUseStepKb() const
    {
        return MATMUL_CONST_PARAM_VAR.baseUseStepKb_;
    }

    // Height (elements) to copy for the i-th M-direction step of matrix A.
    // The last step gets the tail size; every other step is a full stepM * baseM block.
    template <typename INPUT_TYPE_ALIAS = INPUT_TYPE>
    __aicore__ constexpr enable_if_t<INPUT_TYPE_ALIAS::TAG == InputTypeTag::A, int32_t> GetCopyHeight(int32_t i) const
    {
        if (MATMUL_CONST_PARAM_VAR.stepMIdx_ + i >= MATMUL_CONST_PARAM_VAR.mStepIter_ - 1) {
            return MATMUL_CONST_PARAM_VAR.tailStepM_;
        }
        return MATMUL_CONST_PARAM_VAR.tiling_.GetStepM() * MATMUL_CONST_PARAM_VAR.tiling_.GetBaseM();
    }

    // Height (elements) to copy for the i-th N-direction step of matrix B.
    // The last step gets the tail size; every other step is a full stepN * baseN block.
    template <typename INPUT_TYPE_ALIAS = INPUT_TYPE>
    __aicore__ constexpr enable_if_t<INPUT_TYPE_ALIAS::TAG == InputTypeTag::B, int32_t> GetCopyHeight(int32_t i) const
    {
        if (MATMUL_CONST_PARAM_VAR.stepNIdx_ + i >= MATMUL_CONST_PARAM_VAR.nStepIter_ - 1) {
            return MATMUL_CONST_PARAM_VAR.tailStepN_;
        }
        return MATMUL_CONST_PARAM_VAR.tiling_.GetStepN() * MATMUL_CONST_PARAM_VAR.tiling_.GetBaseN();
    }

    // Width (elements) to copy for the i-th step of matrix A.
    // Transposed A steps along the M direction; non-transposed A steps along Ka.
    // In either case the final step uses the tail size.
    template <bool IS_TRANS = false, typename INPUT_TYPE_ALIAS = INPUT_TYPE>
    __aicore__ constexpr enable_if_t<INPUT_TYPE_ALIAS::TAG == InputTypeTag::A, int32_t>
    GetCopyWidth(int32_t i, int32_t baseWidth) const
    {
        if constexpr (IS_TRANS) {
            if (MATMUL_CONST_PARAM_VAR.stepMIdx_ + i >= MATMUL_CONST_PARAM_VAR.mStepIter_ - 1) {
                return MATMUL_CONST_PARAM_VAR.tailStepM_;
            }
            return MATMUL_CONST_PARAM_VAR.tiling_.GetStepM() * baseWidth;
        } else {
            if (MATMUL_CONST_PARAM_VAR.stepKaIdx_ + i >= MATMUL_CONST_PARAM_VAR.kaStepIter_ - 1) {
                return MATMUL_CONST_PARAM_VAR.tailStepKa_;
            }
            return MATMUL_CONST_PARAM_VAR.tiling_.GetStepKa() * baseWidth;
        }
    }

    // Width (elements) to copy for the i-th N-direction step of matrix B.
    // The final step uses the tail size; others a full stepN * baseWidth block.
    template <bool IS_TRANS = false, typename INPUT_TYPE_ALIAS = INPUT_TYPE>
    __aicore__ constexpr enable_if_t<INPUT_TYPE_ALIAS::TAG == InputTypeTag::B, int32_t>
    GetCopyWidth(int32_t i, int32_t baseWidth) const
    {
        if (MATMUL_CONST_PARAM_VAR.stepNIdx_ + i >= MATMUL_CONST_PARAM_VAR.nStepIter_ - 1) {
            return MATMUL_CONST_PARAM_VAR.tailStepN_;
        }
        return MATMUL_CONST_PARAM_VAR.tiling_.GetStepN() * baseWidth;
    }

    // True when the M-direction position i positions past the last M iteration (A input).
    template <typename INPUT_TYPE_ALIAS = INPUT_TYPE>
    __aicore__ constexpr enable_if_t<INPUT_TYPE_ALIAS::TAG == InputTypeTag::A, bool> IsBufferPosEnd(int32_t i) const
    {
        const auto pos = MATMUL_CONST_PARAM_VAR.stepMIdx_ + i;
        return pos >= MATMUL_CONST_PARAM_VAR.mStepIter_;
    }

    // True when the current M-direction position is the final M iteration (A input).
    template <typename INPUT_TYPE_ALIAS = INPUT_TYPE>
    __aicore__ constexpr enable_if_t<INPUT_TYPE_ALIAS::TAG == InputTypeTag::A, bool> IsBufferPosEnd() const
    {
        return MATMUL_CONST_PARAM_VAR.stepMIdx_ + 1 == MATMUL_CONST_PARAM_VAR.mStepIter_;
    }

    // True when the N-direction position i positions past the last N iteration (B input).
    template <typename INPUT_TYPE_ALIAS = INPUT_TYPE>
    __aicore__ constexpr enable_if_t<INPUT_TYPE_ALIAS::TAG == InputTypeTag::B, bool> IsBufferPosEnd(int32_t i) const
    {
        const auto pos = MATMUL_CONST_PARAM_VAR.stepNIdx_ + i;
        return pos >= MATMUL_CONST_PARAM_VAR.nStepIter_;
    }

    // True when the current N-direction position is the final N iteration (B input).
    template <typename INPUT_TYPE_ALIAS = INPUT_TYPE>
    __aicore__ constexpr enable_if_t<INPUT_TYPE_ALIAS::TAG == InputTypeTag::B, bool> IsBufferPosEnd() const
    {
        return MATMUL_CONST_PARAM_VAR.stepNIdx_ + 1 == MATMUL_CONST_PARAM_VAR.nStepIter_;
    }

    // True when the Ka-direction position i positions past the last Ka iteration (A input).
    template <typename INPUT_TYPE_ALIAS = INPUT_TYPE>
    __aicore__ constexpr enable_if_t<INPUT_TYPE_ALIAS::TAG == InputTypeTag::A, bool> IsBufferKPosEnd(int32_t i) const
    {
        const auto pos = MATMUL_CONST_PARAM_VAR.stepKaIdx_ + i;
        return pos >= MATMUL_CONST_PARAM_VAR.kaStepIter_;
    }

    // True when the current Ka-direction position is the final Ka iteration (A input).
    template <typename INPUT_TYPE_ALIAS = INPUT_TYPE>
    __aicore__ constexpr enable_if_t<INPUT_TYPE_ALIAS::TAG == InputTypeTag::A, bool> IsBufferKPosEnd() const
    {
        return MATMUL_CONST_PARAM_VAR.stepKaIdx_ + 1 == MATMUL_CONST_PARAM_VAR.kaStepIter_;
    }

    // True when the Kb-direction position i positions past the last Kb iteration (B input).
    template <typename INPUT_TYPE_ALIAS = INPUT_TYPE>
    __aicore__ constexpr enable_if_t<INPUT_TYPE_ALIAS::TAG == InputTypeTag::B, bool> IsBufferKPosEnd(int32_t i) const
    {
        const auto pos = MATMUL_CONST_PARAM_VAR.stepKbIdx_ + i;
        return pos >= MATMUL_CONST_PARAM_VAR.kbStepIter_;
    }

    // True when the current Kb-direction position is the final Kb iteration (B input).
    template <typename INPUT_TYPE_ALIAS = INPUT_TYPE>
    __aicore__ constexpr enable_if_t<INPUT_TYPE_ALIAS::TAG == InputTypeTag::B, bool> IsBufferKPosEnd() const
    {
        return MATMUL_CONST_PARAM_VAR.stepKbIdx_ + 1 == MATMUL_CONST_PARAM_VAR.kbStepIter_;
    }

    // True when the whole K dimension of A is resident in L1 (no K-direction stepping).
    template <typename INPUT_TYPE_ALIAS = INPUT_TYPE>
    __aicore__ constexpr enable_if_t<INPUT_TYPE_ALIAS::TAG == InputTypeTag::A, bool> IsL1KFullLoad() const
    {
        return MATMUL_CONST_PARAM_VAR.isA1KFullLoad_;
    }

    // True when the whole K dimension of B is resident in L1 (no K-direction stepping).
    template <typename INPUT_TYPE_ALIAS = INPUT_TYPE>
    __aicore__ constexpr enable_if_t<INPUT_TYPE_ALIAS::TAG == InputTypeTag::B, bool> IsL1KFullLoad() const
    {
        return MATMUL_CONST_PARAM_VAR.isB1KFullLoad_;
    }

public:
    // Default construction/destruction: all buffers are owned by the composed modules
    // and are set up in Init() / released in Destroy().
    inline __aicore__ CopyCubeIn() = default;
    inline __aicore__ ~CopyCubeIn() = default;

    // Initializes the L1 input buffer from the tiling, and — for MDL configurations with
    // enableL1CacheUB — additionally reserves one UB queue buffer big enough to cache
    // GetDepthL1CacheUB() step tiles of this input.
    __aicore__ inline void Init()
    {
        MATMUL_MODULE(CubeInBuffer)->Init(MATMUL_MODULE(CopyCubeInParams)->GetBufferSize(),
                                          MATMUL_MODULE(MatmulShapeTiling)->GetDepth());

        if constexpr (DoMatmulMDL(MM_CFG) && ToMatmulConfig(MM_CFG).enableL1CacheUB) {
            const auto cacheDepth = MATMUL_MODULE(MatmulVar)->GetDepthL1CacheUB();
            if (cacheDepth > 0) {
                // One queue slot sized for cacheDepth tiles of stepCol x stepRow base blocks.
                const auto tileSize = MATMUL_MODULE(CopyCubeInParams)->GetStepCol() *
                                      MATMUL_MODULE(CopyCubeInParams)->GetStepRow() *
                                      MATMUL_MODULE(CopyCubeInParams)->GetBufferSize();
                GetTPipePtr()->InitBuffer(qidUBCache_, 1, cacheDepth * tileSize * sizeof(SRC_T));
            }
        }
    }

    // Binds a UB/local-memory input tensor for subsequent LoadData calls.
    // Resets the L1 cube-in buffer so stale tiles of the previous input cannot be hit.
    __aicore__ inline void SetInput(const TBuffAddr& address, bool isTranspose)
    {
        // do Set Local Input here
        MATMUL_MODULE(MatmulTensorInfo)->SetLocalAddr(address, isTranspose);
        MATMUL_MODULE(CubeInBuffer)->Reset();
    }

    // Binds a global-memory input tensor for subsequent LoadData calls.
    // Resets the L1 cube-in buffer so stale tiles of the previous input cannot be hit.
    __aicore__ inline void SetInput(__gm__ SRC_T *srcGlobalAddr, bool isTranspose)
    {
        // do Set Global Input here
        MATMUL_MODULE(MatmulTensorInfo)->SetGlobalAddr(srcGlobalAddr, isTranspose);
        MATMUL_MODULE(CubeInBuffer)->Reset();
    }

    // Invalidates all cached L1 tiles; the next LoadData will re-copy from the source.
    __aicore__ inline void Reset()
    {
        MATMUL_MODULE(CubeInBuffer)->Reset();
    }

    // Returns the L1 tensor holding the (curRow, curCol) tile, copying it in on a miss.
    // MDL configurations track tiles by (iteration index, double-buffer position); the
    // normal path tracks them by iteration index alone. On a miss the tile is copied with
    // row/col (and height/width) swapped when the input is transposed, then queued through
    // the cube-in buffer to publish it to the cube pipeline.
    __aicore__ inline LocalTensor<TRANS_T> LoadData(int curRow, int curCol, int tileHeight, int tileWidth)
    {
        LocalTensor<TRANS_T> l1;
        auto posL1 = MATMUL_MODULE(CubeInBuffer)->GetIterIndex(curRow, curCol);
        if constexpr (DoMatmulMDL(MM_CFG)) {
            auto bufferPos = MATMUL_MODULE(CopyCubeInParams)->GetBufferPos();
            if (MATMUL_MODULE(CubeInBuffer)->Hit(posL1, bufferPos)) {
                return MATMUL_MODULE(CubeInBuffer)->GetBuffer(posL1, bufferPos);
            }
            l1 = MATMUL_MODULE(CubeInBuffer)->AllocTensor(bufferPos);
        } else {
            if (MATMUL_MODULE(CubeInBuffer)->Hit(posL1)) {
                return MATMUL_MODULE(CubeInBuffer)->GetBuffer(posL1);
            }
            l1 = MATMUL_MODULE(CubeInBuffer)->AllocTensor(posL1);
        }
        // Miss: bring the tile into L1. A transposed input swaps the row/col roles.
        if (MATMUL_MODULE(MatmulShapeInfo)->IsTranspose()) {
            CopyTileToCube<true>(l1, curCol, curRow, tileWidth, tileHeight);
        } else {
            CopyTileToCube<false>(l1, curRow, curCol, tileHeight, tileWidth);
        }
        MATMUL_MODULE(CubeInBuffer)->EnQue(l1);
        MATMUL_MODULE(CubeInBuffer)->DeQue();
        return l1;
    }

    // Releases the L1 tile previously obtained from LoadData.
    // MDL configurations free by double-buffer position; the normal path frees by the
    // (curRow, curCol) iteration index together with the tensor handle.
    __aicore__ inline void ClearLoadData(const LocalTensor<TRANS_T>& aMatrix = NULL_TENSOR<TRANS_T>,
        int32_t curRow = 0, int32_t curCol = 0)
    {
#if __CCE_AICORE__ == 310
        // On this core, UB-resident inputs are not tracked in the cube-in buffer: nothing to free.
        if constexpr (PhyPosIsUB(INPUT_TYPE::pos)) {
            return;
        }
#endif
        if constexpr (DoMatmulMDL(MM_CFG)) {
            auto bufferPos = MATMUL_MODULE(CopyCubeInParams)->GetBufferPos();
            MATMUL_MODULE(CubeInBuffer)->FreeTensor(bufferPos);
        } else {
            int posL1 = MATMUL_MODULE(CubeInBuffer)->GetIterIndex(curRow, curCol);
            MATMUL_MODULE(CubeInBuffer)->FreeTensor(posL1, aMatrix);
        }
    }

    // Tears down the L1 input buffer; call once this input is no longer needed.
    __aicore__ inline void Destroy()
    {
        MATMUL_MODULE(CubeInBuffer)->Destroy();
    }

private:
    // Copies one tile into the L1 cube buffer, dispatching at compile time on whether the
    // source physically resides in UB or in GM. Returns the callee's success flag
    // (false when the format/layout combination is unsupported).
    template <bool IS_TRANS = false>
    __aicore__ inline bool CopyTileToCube(const LocalTensor<TRANS_T>& aMatrix, int curRow, int curCol,
                                          int tileHeight, int tileWidth)
    {
        if constexpr (PhyPosIsUB(INPUT_TYPE::pos)) {
            return CopyTileToCubeFromUB<IS_TRANS>(aMatrix, curRow, curCol, tileHeight, tileWidth);
        } else {
            return CopyTileToCubeFromGM<IS_TRANS>(aMatrix, curRow, curCol, tileHeight, tileWidth);
        }
    }

    // Copies one tile from GM into L1, converting layout according to the input CubeFormat:
    //  - ND: ND->NZ conversion; int8 B matrices that are not transposed go through the
    //    trans-data path (which also transposes the fractal layout).
    //  - NZ: direct NZ->NZ block copy, with the same int8 trans-data special case.
    //  - VECTOR: 1-D copy into A1; rejected when K runs along the row direction.
    // Returns false for unsupported format/layout combinations, true otherwise.
    template <bool IS_TRANS = false>
    __aicore__ inline bool CopyTileToCubeFromGM(const LocalTensor<TRANS_T>& aMatrix, int curRow, int curCol,
                                                int tileHeight, int tileWidth)
    {
        if constexpr (INPUT_TYPE::format == CubeFormat::ND) {
            GlobalTensor<SRC_T> aGlobal;
            aGlobal.SetGlobalBuffer(MATMUL_MODULE(MatmulTensorInfo)->GetGlobalAddr());
            if constexpr (INPUT_TYPE::TAG == InputTypeTag::B && IsSameTypeV<TRANS_T, int8_t> &&
                          IsSameTypeV<SRC_T, int8_t>) {
                // int8 B input: a non-transposed source must be transposed via trans-data.
                if (!MATMUL_MODULE(MatmulShapeInfo)->IsTranspose()) {
                    CopyND2NZWithTransData<IS_TRANS>(aMatrix, aGlobal, curRow, curCol, tileHeight,
                                                     tileWidth);
                } else {
                    CopyTileToCubeFromGMAndND<IS_TRANS>(aMatrix, aGlobal, curRow, curCol,
                        tileHeight, tileWidth);
                }
            } else {
                CopyTileToCubeFromGMAndND<IS_TRANS>(aMatrix, aGlobal, curRow, curCol,
                    tileHeight, tileWidth);
            }
        } else if constexpr (INPUT_TYPE::format == CubeFormat::NZ) {
            GlobalTensor<TRANS_T> aGlobal;
            aGlobal.SetGlobalBuffer(MATMUL_MODULE(MatmulTensorInfo)->GetGlobalAddr());
            if constexpr (INPUT_TYPE::TAG == InputTypeTag::B && IsSameTypeV<TRANS_T, int8_t> &&
                          IsSameTypeV<SRC_T, int8_t>) {
                if (!MATMUL_MODULE(MatmulShapeInfo)->IsTranspose()) {
                    CopyNZ2NZWithTransData<IS_TRANS>(aMatrix, aGlobal, curRow, curCol, tileHeight,
                                                     tileWidth);
                } else {
                    // Tile indices are scaled to element offsets by the base block shape.
                    CopyNZ2NZ(aMatrix, aGlobal, curRow * MATMUL_MODULE(MatmulShapeInfo)->template GetBaseHeight<IS_TRANS>(),
                              curCol * MATMUL_MODULE(MatmulShapeInfo)->template GetBaseWidth<IS_TRANS>(), tileHeight,
                              tileWidth, MATMUL_MODULE(MatmulShapeInfo)->template GetOrgHeight<IS_TRANS>());
                }
            } else {
                CopyNZ2NZ(aMatrix, aGlobal, curRow * MATMUL_MODULE(MatmulShapeInfo)->template GetBaseHeight<IS_TRANS>(),
                          curCol * MATMUL_MODULE(MatmulShapeInfo)->template GetBaseWidth<IS_TRANS>(), tileHeight, tileWidth,
                          MATMUL_MODULE(MatmulShapeInfo)->template GetOrgHeight<IS_TRANS>());
            }
        } else if constexpr (INPUT_TYPE::format == CubeFormat::VECTOR) {
            // A vector input cannot be consumed when K runs along the row direction.
            if (MATMUL_MODULE(MatmulShapeInfo)->IsKRowDirec()) {
                return false;
            }
            GlobalTensor<TRANS_T> aGlobal;
            aGlobal.SetGlobalBuffer(MATMUL_MODULE(MatmulTensorInfo)->GetGlobalAddr());
            CopyVector2A1(aMatrix, aGlobal, curCol * MATMUL_MODULE(MatmulShapeInfo)->template GetBaseWidth<IS_TRANS>(),
                          CeilT<int32_t>(tileWidth, c0Size_));
        } else {
            return false;
        }
        return true;
    }

    // ND tile from GM into L1: compile-time choice between the vector-op ND->NZ path
    // (enVecND2NZ) and the on-the-fly ND->NZ DMA path. Tile indices are scaled to element
    // offsets by the base block shape.
    template <bool IS_TRANS = false>
    __aicore__ inline void CopyTileToCubeFromGMAndND(const LocalTensor<TRANS_T>& aMatrix,
                                                     GlobalTensor<SRC_T>& aGlobal,
                                                     int curRow, int curCol, int tileHeight, int tileWidth)
    {
        if constexpr (ToMatmulConfig(MM_CFG).enVecND2NZ) {
            CopyND2NZWithVecOp<IS_TRANS>(aMatrix, aGlobal, curRow, curCol, tileHeight, tileWidth);
        } else {
            CopyND2NZOnTheFly(aMatrix, aGlobal, curRow * MATMUL_MODULE(MatmulShapeInfo)->template GetBaseHeight<IS_TRANS>(),
                curCol * MATMUL_MODULE(MatmulShapeInfo)->template GetBaseWidth<IS_TRANS>(), tileHeight, tileWidth,
                MATMUL_MODULE(MatmulShapeInfo)->template GetOrgWidth<IS_TRANS>());
        }
    }

    // Vector-op ND->NZ copy wrapper: scales tile indices to element offsets and forwards
    // to CopyND2NZ. The static_assert guards the config space: L1CacheUB is only
    // implemented for the MDL variant.
    template <bool IS_TRANS = false>
    __aicore__ inline void CopyND2NZWithVecOp(const LocalTensor<TRANS_T>& aMatrix, const GlobalTensor<SRC_T>& src,
                                              int curRow, int curCol, int tileHeight, int tileWidth)
    {
        if constexpr (ToMatmulConfig(MM_CFG).enableL1CacheUB) {
            static_assert(DoMatmulMDL(MM_CFG), "Only MDL version support L1CacheUB.");
        }
        CopyND2NZ<IS_TRANS>(aMatrix, src, curRow * MATMUL_MODULE(MatmulShapeInfo)->template GetBaseHeight<IS_TRANS>(),
            curCol * MATMUL_MODULE(MatmulShapeInfo)->template GetBaseWidth<IS_TRANS>(), tileHeight, tileWidth,
            MATMUL_MODULE(MatmulShapeInfo)->template GetOrgWidth<IS_TRANS>());
    }

    // Copies one tile from a UB-resident source into L1; mirrors CopyTileToCubeFromGM but
    // reads through a LocalTensor bound to the stored local address.
    //  - ND: on-the-fly ND->NZ only — the vector-op path (enVecND2NZ) is unsupported from
    //    UB and reports false; int8 non-transposed B goes through trans-data.
    //  - NZ: direct NZ->NZ block copy, with the int8 trans-data special case.
    //  - VECTOR: 1-D copy into A1; rejected when K runs along the row direction.
    // Returns false for unsupported combinations, true otherwise.
    template <bool IS_TRANS = false>
    __aicore__ inline bool CopyTileToCubeFromUB(const LocalTensor<TRANS_T>& aMatrix, int curRow, int curCol,
                                                int tileHeight, int tileWidth)
    {
        if constexpr (INPUT_TYPE::format == CubeFormat::ND) {
            if constexpr (INPUT_TYPE::TAG == InputTypeTag::B && IsSameTypeV<TRANS_T, int8_t> &&
                          IsSameTypeV<SRC_T, int8_t>) {
                if (!MATMUL_MODULE(MatmulShapeInfo)->IsTranspose()) {
                    LocalTensor<SRC_T> leftMatrix;
                    leftMatrix.SetAddr(MATMUL_MODULE(MatmulTensorInfo)->GetLocalAddr());
                    CopyND2NZWithTransData<IS_TRANS>(aMatrix, leftMatrix, curRow, curCol, tileHeight,
                                                     tileWidth);
                } else {
                    if constexpr (ToMatmulConfig(MM_CFG).enVecND2NZ) {
                        return false;
                    } else {
                        CopyTileToCubeFromUBAndND<IS_TRANS>(aMatrix, curRow, curCol,
                            tileHeight, tileWidth);
                    }
                }
            } else {
                if constexpr (ToMatmulConfig(MM_CFG).enVecND2NZ) {
                    return false;
                } else {
                    CopyTileToCubeFromUBAndND<IS_TRANS>(aMatrix, curRow, curCol,
                        tileHeight, tileWidth);
                }
            }
        } else if constexpr (INPUT_TYPE::format == CubeFormat::NZ) {
            LocalTensor<SRC_T> leftMatrix;
            leftMatrix.SetAddr(MATMUL_MODULE(MatmulTensorInfo)->GetLocalAddr());
            if constexpr (INPUT_TYPE::TAG == InputTypeTag::B && IsSameTypeV<TRANS_T, int8_t> &&
                          IsSameTypeV<SRC_T, int8_t>) {
                if (!MATMUL_MODULE(MatmulShapeInfo)->IsTranspose()) {
                    CopyNZ2NZWithTransData<IS_TRANS>(aMatrix, leftMatrix, curRow, curCol, tileHeight,
                                                     tileWidth);
                } else {
                    // Tile indices are scaled to element offsets by the base block shape.
                    CopyNZ2NZ(aMatrix, leftMatrix,
                              curRow * MATMUL_MODULE(MatmulShapeInfo)->template GetBaseHeight<IS_TRANS>(),
                              curCol * MATMUL_MODULE(MatmulShapeInfo)->template GetBaseWidth<IS_TRANS>(), tileHeight,
                              tileWidth, MATMUL_MODULE(MatmulShapeInfo)->template GetOrgHeight<IS_TRANS>());
                }
            } else {
                CopyNZ2NZ(aMatrix, leftMatrix, curRow * MATMUL_MODULE(MatmulShapeInfo)->template GetBaseHeight<IS_TRANS>(),
                          curCol * MATMUL_MODULE(MatmulShapeInfo)->template GetBaseWidth<IS_TRANS>(), tileHeight, tileWidth,
                          MATMUL_MODULE(MatmulShapeInfo)->template GetOrgHeight<IS_TRANS>());
            }
        } else if constexpr (INPUT_TYPE::format == CubeFormat::VECTOR) {
            // A vector input cannot be consumed when K runs along the row direction.
            if (MATMUL_MODULE(MatmulShapeInfo)->IsKRowDirec()) {
                return false;
            }
            LocalTensor<SRC_T> leftMatrix;
            leftMatrix.SetAddr(MATMUL_MODULE(MatmulTensorInfo)->GetLocalAddr());
            CopyVector2A1(aMatrix, leftMatrix, curCol * MATMUL_MODULE(MatmulShapeInfo)->template GetBaseWidth<IS_TRANS>(),
                          CeilT<int32_t>(tileWidth, c0Size_));
        } else {
            return false;
        }
        return true;
    }

    // ND tile from UB into L1 via the on-the-fly ND->NZ DMA path: binds a LocalTensor to
    // the stored local address and scales tile indices to element offsets.
    template <bool IS_TRANS = false>
    __aicore__ inline void CopyTileToCubeFromUBAndND(const LocalTensor<TRANS_T>& aMatrix,
                                                     int curRow, int curCol,
                                                     int tileHeight, int tileWidth)
    {
        LocalTensor<SRC_T> leftMatrix;
        leftMatrix.SetAddr(MATMUL_MODULE(MatmulTensorInfo)->GetLocalAddr());
        CopyND2NZOnTheFly(
            aMatrix, leftMatrix, curRow * MATMUL_MODULE(MatmulShapeInfo)->template GetBaseHeight<IS_TRANS>(),
            curCol * MATMUL_MODULE(MatmulShapeInfo)->template GetBaseWidth<IS_TRANS>(), tileHeight, tileWidth,
            MATMUL_MODULE(MatmulShapeInfo)->template GetOrgWidth<IS_TRANS>());
    }

    // Copies an NZ-formatted tile from GM into L1.
    // @param row/col  element offsets of the tile inside the origin matrix
    // @param height/width  tile size in elements
    // @param gRow  origin matrix height in rows; the NZ column pitch is its 16-aligned value
    // When the inter-burst stride fits in uint16_t, a single multi-burst DataCopy is issued;
    // otherwise the copy falls back to one burst per fractal column.
    __aicore__ inline void CopyNZ2NZ(const LocalTensor<TRANS_T>& dst, const GlobalTensor<TRANS_T>& src, const int row,
                                     const int col, const int height, const int width, const int gRow)
    {
        ASCENDC_ASSERT((gRow >= height), {
            KERNEL_LOG(
                KERNEL_ERROR,
                "NZ2NZ height larger than origin matrix height, gRow is %d, which should be no less than height %d.",
                gRow, height);
        });
        int alignedGRow = CeilAlignT<int32_t>(gRow, BLOCK_CUBE);
        int64_t srcOffset = (int64_t)row * (int64_t)c0Size_ + (int64_t)col * (int64_t)alignedGRow;
        // height direction need to be 16 aligned
        auto alignHeight = CeilAlignT<int32_t>(height, BLOCK_CUBE);
        int blockLen = alignHeight * c0Size_ * sizeof(TRANS_T) / ONE_BLK_SIZE;
        int srcStride = (alignedGRow - alignHeight) * (c0Size_ * sizeof(TRANS_T) / ONE_BLK_SIZE);

        if (srcStride >= UINT16_MAX) {
            // Stride does not fit the DataCopyParams field: copy one fractal column per burst.
            // Fix: the source column pitch is the 16-aligned origin height (alignedGRow),
            // matching srcOffset and the multi-burst branch below (previously the raw gRow
            // was used here); the per-column offset is also widened to 64 bits since this
            // branch only triggers for very large matrices.
            for (int i = 0; i < CeilT<int32_t>(width, c0Size_); ++i) {
                DataCopy(dst[i * alignHeight * c0Size_],
                         src[srcOffset + (int64_t)i * (int64_t)alignedGRow * (int64_t)c0Size_],
                         { 1, static_cast<uint16_t>(blockLen), 0, 0 });
            }
        } else {
            uint16_t nburst = CeilT<int32_t>(width, c0Size_);
            DataCopy(dst, src[srcOffset],
                     { nburst, static_cast<uint16_t>(blockLen), static_cast<uint16_t>(srcStride), 0 });
        }
    };

    // Copies an NZ-formatted tile that already resides in UB into L1.
    // @param row/col  element offsets of the tile inside the origin matrix
    // @param height/width  tile size in elements
    // @param gRow  origin matrix height; unlike the GM overload the raw gRow is used as
    //              the column pitch for UB-resident sources.
    __aicore__ inline void CopyNZ2NZ(const LocalTensor<TRANS_T>& dst, const LocalTensor<TRANS_T>& src, const int row,
                                     const int col, const int height, const int width, const int gRow)
    {
        ASCENDC_ASSERT((gRow >= height), {
            KERNEL_LOG(KERNEL_ERROR, "gRow is %d, which should be no less than height %d.", gRow, height);
        });
        int srcOffset = row * c0Size_ + col * gRow;
        // height direction need to be 16 aligned (BLOCK_CUBE); replaces the magic 15/16 form
        auto alignHeight = CeilAlignT<int32_t>(height, BLOCK_CUBE);
        int blockLen = alignHeight * c0Size_ * sizeof(TRANS_T) / ONE_BLK_SIZE;
        int srcStride = (gRow - alignHeight) * (c0Size_ * sizeof(TRANS_T) / ONE_BLK_SIZE);

        if (srcStride >= UINT16_MAX) {
            // NOTE(review): width is truncated here (width / c0Size_) while the GM overload
            // ceils it — confirm UB tiles always arrive with c0-aligned width.
            for (int i = 0; i < width / c0Size_; ++i) {
                DataCopy(dst[i * alignHeight * c0Size_], src[srcOffset + i * gRow * c0Size_],
                         { 1, static_cast<uint16_t>(blockLen), 0, 0 });
            }
        } else {
            DataCopy(dst, src[srcOffset],
                     { static_cast<uint16_t>(width / c0Size_), static_cast<uint16_t>(blockLen),
                       static_cast<uint16_t>(srcStride), 0 });
        }
    };

    // Copies a 1-D vector tile from GM into A1 as a single burst of blockLen blocks,
    // starting at element offset col, using vector block mode.
    __aicore__ inline void CopyVector2A1(const LocalTensor<TRANS_T>& dst, GlobalTensor<TRANS_T>& src, const int col,
                                         const int blockLen)
    {
        ASCENDC_ASSERT((col >= 0), { KERNEL_LOG(KERNEL_ERROR, "col is %d, which should be no less than 0.", col); });
        ASCENDC_ASSERT((INPUT_TYPE::format == CubeFormat::VECTOR),
                       { KERNEL_LOG(KERNEL_ERROR, "INPUT_TYPE::format should be CubeFormat::VECTOR."); });

        // One contiguous burst; no gaps on either side.
        DataCopyParams copyParams;
        copyParams.blockCount = 1;
        copyParams.blockLen = blockLen;
        copyParams.srcStride = 0;
        copyParams.dstStride = 0;
        DataCopyEnhancedParams enhancedParams;
        enhancedParams.blockMode = BlockMode::BLOCK_MODE_VECTOR;
        DataCopy(dst, src[col], copyParams, enhancedParams);
    };

    // Copies a 1-D vector tile that already resides in UB into A1 as a single burst of
    // blockLen blocks, starting at element offset col.
    __aicore__ inline void CopyVector2A1(const LocalTensor<TRANS_T>& dst, LocalTensor<TRANS_T>& src, const int col,
                                         const int blockLen)
    {
        ASCENDC_ASSERT((col >= 0), { KERNEL_LOG(KERNEL_ERROR, "col is %d, which should be no less than 0.", col); });
        ASCENDC_ASSERT((INPUT_TYPE::format == CubeFormat::VECTOR),
                       { KERNEL_LOG(KERNEL_ERROR, "INPUT_TYPE::format should be CubeFormat::VECTOR."); });

        // One contiguous burst; no gaps on either side.
        DataCopyParams copyParams;
        copyParams.blockCount = 1;
        copyParams.blockLen = blockLen;
        copyParams.srcStride = 0;
        copyParams.dstStride = 0;
        DataCopy(dst, src[col], copyParams);
    };

    // ND(GM) -> NZ(L1) copy staged through UB with vector ops: loads the ND block into a
    // UB workspace, zero-pads and converts it to NZ, then DMAs the result into L1.
    // With TRANS_T==half and SRC_T==int8_t the tile is additionally de-quantized
    // (AntiQuantCompute) before the NZ transform (MDL-only, enforced by static_assert).
    // Hardware events are raised/waited explicitly to serialize the V/MTE2/MTE3 pipes.
    template <bool IS_TRANS = false>
    __aicore__ inline void CopyND2NZ(const LocalTensor<TRANS_T>& dst, const GlobalTensor<SRC_T>& src, const int row,
                                     const int col, const int height, const int width, const int gCol)
    {
        // Pad one extra block column when the row pitch is a multiple of the cache line,
        // to avoid bank conflicts in the UB workspace.
        int calcWidth = CeilT(width, c0Size_);
        bool isBankConflict = calcWidth * EACH_BLOCK_BYTES_MM_API % CACHE_LINE_SIZE_MM_API == 0 &&
                              calcWidth < EACH_BLOCK_BYTES_MM_API ? true : false;
        int c0Size = c0Size_;
        if constexpr (IsSameTypeV<TRANS_T, half> && IsSameTypeV<SRC_T, int8_t>) {
            // Anti-quant path: source elements are int8, so a fractal holds 32 of them.
            c0Size = 32;
        }
        int padWidth = isBankConflict ? Ceil(width, c0Size) + 1 : Ceil(width, c0Size);
        int size = MATMUL_MODULE(MatmulVar)->GetTransLength();
        if constexpr (IsSameTypeV<TRANS_T, half> && IsSameTypeV<SRC_T, int8_t>) {
            size = Ceil(height, c0Size) * padWidth * c0Size * c0Size / AuxGetFactor<TRANS_T>();
        }

        // Two UB workspaces: transTensor holds the raw ND block, trans the transformed data.
        LocalTensor<SRC_T> transTensor;
        transTensor = MATMUL_MODULE(MatmulVar)->GetLocalWorkspace(0).template ReinterpretCast<SRC_T>();
        transTensor.SetSize(size);
        LocalTensor<TRANS_T> trans;
        trans = MATMUL_MODULE(MatmulVar)->GetLocalWorkspace(
            MATMUL_MODULE(MatmulVar)->GetTransLength()).template ReinterpretCast<TRANS_T>();
        trans.SetSize(size);
        int64_t srcOffset = (int64_t)row * (int64_t)gCol + (int64_t)col;

        int calcHigh = CeilT<int32_t>(height, BLOCK_CUBE);
        // Ensure previous vector work on the workspace finished before MTE2 overwrites it.
        auto enQueEvtID = GetTPipePtr()->FetchEventID(HardEvent::V_MTE2);
        if constexpr (!ToMatmulConfig(MM_CFG).enableL1CacheUB) {
            SetFlag<HardEvent::V_MTE2>(enQueEvtID);
            WaitFlag<HardEvent::V_MTE2>(enQueEvtID);
        }
        if constexpr (ToMatmulConfig(MM_CFG).enableL1CacheUB) {
            // L1CacheUB path: fetch through the per-input UB cache instead of a direct load.
            if constexpr (INPUT_TYPE::TAG == InputTypeTag::A) {
                calcWidth = GetANDBlockFromGM<IS_TRANS>(transTensor, src, row, col, height, width, gCol, isBankConflict);
            } else {
                calcWidth = GetBNDBlockFromGM<IS_TRANS>(transTensor, src, row, col, height, width, gCol, isBankConflict);
            }
        } else {
            calcWidth = CopyNDBlock(transTensor, src, srcOffset, height, width, gCol, isBankConflict);
        }

        if constexpr (IsSameTypeV<TRANS_T, half> && IsSameTypeV<SRC_T, int8_t>) {
            static_assert(DoMatmulMDL(MM_CFG), "Only MDL version support AntiQuant.");
            if (!MATMUL_MODULE(MatmulShapeInfo)->IsTranspose()) {
                // Scalar pipe reads the loaded data; wait for MTE2 to land it first.
                enQueEvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE2_S);
                SetFlag<HardEvent::MTE2_S>(enQueEvtID);
                WaitFlag<HardEvent::MTE2_S>(enQueEvtID);
            }
            // int8 -> half de-quantization, then pad + ND->NZ on the widened data.
            AntiQuantCompute(trans, transTensor, isBankConflict);
            PipeBarrier<PIPE_V>();
            constexpr int32_t padBlock = 2;
            // NOTE(review): this padWidth (and `size`/`padWidth` in the else branch below)
            // shadows the outer locals of the same name — intentional but easy to misread.
            int32_t padWidth = isBankConflict ? calcWidth + padBlock : calcWidth;
            // update fp16 padwidth
            (const_cast<LocalTensor<TRANS_T>&>(dst)).SetSize(size);
            SetMaskNorm();
            NDPadZeros(trans, height, padWidth, gCol, width, isBankConflict);
            LocalTensor<TRANS_T> nzTensor;
            nzTensor = MATMUL_MODULE(MatmulVar)->GetLocalWorkspace(0).template ReinterpretCast<TRANS_T>();
            nzTensor.SetSize(size);
            PipeBarrier<PIPE_V>();
            NDTrans2NZ(nzTensor, trans, calcHigh, calcWidth, isBankConflict);
            // Vector results must be visible before MTE3 moves them to L1.
            enQueEvtID = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
            SetFlag<HardEvent::V_MTE3>(enQueEvtID);
            WaitFlag<HardEvent::V_MTE3>(enQueEvtID);
            DataCopy(dst, nzTensor, size);
            // Block the next GM load until the outbound copy has drained the workspace.
            enQueEvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE2);
            SetFlag<HardEvent::MTE3_MTE2>(enQueEvtID);
            WaitFlag<HardEvent::MTE3_MTE2>(enQueEvtID);
        } else {
            // Plain path: pad + ND->NZ on the raw data, then move to L1.
            int padWidth = isBankConflict ? calcWidth + 1 : calcWidth;
            int size = calcHigh * padWidth * BLOCK_CUBE * c0Size_ / AuxGetFactor<TRANS_T>();
            transTensor.SetSize(size);
            trans.SetSize(size);
            (const_cast<LocalTensor<TRANS_T>&>(dst)).SetSize(size);
            NDPadZeros(transTensor, height, padWidth, gCol, width, isBankConflict);
            NDTrans2NZ(trans, transTensor, calcHigh, calcWidth, isBankConflict);
            // Vector results must be visible before MTE3 moves them to L1.
            enQueEvtID = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
            SetFlag<HardEvent::V_MTE3>(enQueEvtID);
            WaitFlag<HardEvent::V_MTE3>(enQueEvtID);
            DataCopy(dst, trans, size);
            // Let the vector pipe reuse the workspace only after MTE3 finished reading it.
            enQueEvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE3_V);
            SetFlag<HardEvent::MTE3_V>(enQueEvtID);
            WaitFlag<HardEvent::MTE3_V>(enQueEvtID);
        }
        return;
    };

    // Fetch one ND block of matrix A from GM through the UB-resident cache.
    // On the first call (cache2UBProc_ == 0), or once every cached tile has been consumed,
    // the cache buffer is (re)allocated from qidUBCache_ and refilled with up to
    // GetDepthL1CacheUB() consecutive tiles copied from GM; subsequent calls serve tiles
    // straight out of the cache with a single UB->UB DataCopy.
    // Returns the tile width expressed in c0Size_-element blocks (last CopyNDBlock result).
    template <bool IS_TRANS = false>
    __aicore__ inline int32_t GetANDBlockFromGM(const LocalTensor<SRC_T>& transTensor, const GlobalTensor<SRC_T>& src,
                                                const int row, const int col, const int height, const int width,
                                                const int gCol, const bool isBankConflict)
    {
        // Make sure previous MTE1 reads of the destination finished before MTE2 overwrites it.
        auto enQueEvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE1_MTE2);
        SetFlag<HardEvent::MTE1_MTE2>(enQueEvtID);
        WaitFlag<HardEvent::MTE1_MTE2>(enQueEvtID);
        // Linear element offset of the requested tile head inside the GM matrix.
        int64_t srcOffset = (int64_t)row * (int64_t)gCol + (int64_t)col;

        // Size (in elements) of one cached tile slot.
        uint32_t cacheA1Size =
            MATMUL_MODULE(CopyCubeInParams)->GetStepCol() * MATMUL_MODULE(CopyCubeInParams)->GetStepRow() * MATMUL_MODULE(CopyCubeInParams)->GetBufferSize();
        int calcWidth = CeilT(width, c0Size_);
        // Refill the cache when it is empty or every cached tile has been consumed.
        if (cache2UBProc_ == 0 || cache2UBProc_ >= MATMUL_MODULE(MatmulVar)->GetDepthL1CacheUB()) {
            if (cache2UBProc_ == 0) {
                // NOTE(review): allocated as TRANS_T here while blockLen below uses sizeof(SRC_T),
                // and GetBNDBlockFromGM allocates SRC_T — confirm the element sizes agree.
                cacheHead2UB_ = qidUBCache_.template AllocTensor<TRANS_T>();
            } else {
                qidUBCache_.FreeTensor(cacheHead2UB_);
                cacheHead2UB_ = qidUBCache_.template AllocTensor<TRANS_T>(); // To use que to insert events
            }
            if (IsL1KFullLoad()) {
                // K dimension fully resident in L1: cache successive step-col tiles.
                for (int i = 0; i < MATMUL_MODULE(MatmulVar)->GetDepthL1CacheUB(); ++i) {
                    if (IsBufferPosEnd(i)) {
                        break;
                    }
                    int copyHeight = GetCopyHeight(i);
                    auto a1CacheUb = cacheHead2UB_[i * cacheA1Size];
                    calcWidth = CopyNDBlock(a1CacheUb, src, srcOffset, copyHeight, width, gCol, isBankConflict);
                    // Advance the GM offset to the next tile; direction depends on transpose.
                    if (MATMUL_MODULE(MatmulShapeInfo)->IsTranspose()) {
                        srcOffset += MATMUL_MODULE(CopyCubeInParams)->GetStepCol() *
                                     MATMUL_MODULE(MatmulShapeInfo)->template GetBaseHeight();
                    } else {
                        srcOffset += MATMUL_MODULE(CopyCubeInParams)->GetStepCol() *
                                     MATMUL_MODULE(MatmulShapeInfo)->template GetBaseHeight() * (int64_t)gCol;
                    }
                }
            } else {
                // K dimension split: cache successive step-row tiles along K.
                for (int i = 0; i < MATMUL_MODULE(MatmulVar)->GetDepthL1CacheUB(); ++i) {
                    if (IsBufferKPosEnd(i)) {
                        break;
                    }
                    int copyWidth = GetCopyWidth(i, MATMUL_MODULE(MatmulShapeInfo)->template GetBaseWidth<IS_TRANS>());
                    auto a1CacheUb = cacheHead2UB_[i * cacheA1Size];
                    calcWidth = CopyNDBlock(a1CacheUb, src, srcOffset, height, copyWidth, gCol, isBankConflict);
                    // Advance the GM offset to the next K tile; direction depends on transpose.
                    if (MATMUL_MODULE(MatmulShapeInfo)->IsTranspose()) {
                        srcOffset += MATMUL_MODULE(CopyCubeInParams)->GetStepRow() *
                                     MATMUL_MODULE(MatmulShapeInfo)->template GetBaseHeight<IS_TRANS>() * (int64_t)gCol;
                    } else {
                        srcOffset += MATMUL_MODULE(CopyCubeInParams)->GetStepRow() *
                                     MATMUL_MODULE(MatmulShapeInfo)->template GetBaseWidth<IS_TRANS>();
                    }
                }
            }
            // Restart the read cursor and fence the GM->UB copies before MTE1 consumes them.
            cache2UBProc_ = 0;
            auto mte2ToMte1EvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE2_MTE1);
            SetFlag<HardEvent::MTE2_MTE1>(mte2ToMte1EvtID);
            WaitFlag<HardEvent::MTE2_MTE1>(mte2ToMte1EvtID);
        }
        // fetch data from Cache
        uint16_t blockLen = cacheA1Size * sizeof(SRC_T) / ONE_BLK_SIZE;

        auto vToMte1EvtID = GetTPipePtr()->FetchEventID(HardEvent::V_MTE1);
        SetFlag<HardEvent::V_MTE1>(vToMte1EvtID);
        WaitFlag<HardEvent::V_MTE1>(vToMte1EvtID);
        // Hand the current cached tile (slot cache2UBProc_) to the caller's buffer.
        DataCopy(transTensor, cacheHead2UB_[cache2UBProc_ * cacheA1Size], { 1, static_cast<uint16_t>(blockLen), 0, 0 });
        auto mte1ToVEvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE1_V);
        SetFlag<HardEvent::MTE1_V>((event_t)mte1ToVEvtID);
        WaitFlag<HardEvent::MTE1_V>((event_t)mte1ToVEvtID);
        ++cache2UBProc_;
        // Release the cache buffer once the last tile of the pass has been handed out.
        if (IsL1KFullLoad()) {
            if (IsBufferPosEnd()) {
                cache2UBProc_ = 0;
                qidUBCache_.FreeTensor(cacheHead2UB_);
            }
        } else {
            if (IsBufferKPosEnd()) {
                cache2UBProc_ = 0;
                qidUBCache_.FreeTensor(cacheHead2UB_);
            }
        }
        return calcWidth;
    }

    // Fetch one ND block of matrix B from GM through the UB-resident cache.
    // Mirrors GetANDBlockFromGM: (re)fill the cache with up to GetDepthL1CacheUB() tiles
    // when it is empty or exhausted, then serve the current tile with one UB->UB DataCopy.
    // Returns the tile width expressed in c0Size_-element blocks (last CopyNDBlock result).
    template <bool IS_TRANS = false>
    __aicore__ inline int32_t GetBNDBlockFromGM(const LocalTensor<SRC_T>& transTensor, const GlobalTensor<SRC_T>& src,
                                                const int row, const int col, const int height, const int width,
                                                const int gCol, const bool isBankConflict)
    {
        // Make sure previous MTE1 reads of the destination finished before MTE2 overwrites it.
        auto enQueEvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE1_MTE2);
        SetFlag<HardEvent::MTE1_MTE2>(enQueEvtID);
        WaitFlag<HardEvent::MTE1_MTE2>(enQueEvtID);

        // Linear element offset of the requested tile head inside the GM matrix.
        int64_t srcOffset = (int64_t)row * (int64_t)gCol + (int64_t)col;
        // Size (in elements) of one cached tile slot.
        uint32_t cacheB1Size =
            MATMUL_MODULE(CopyCubeInParams)->GetStepCol() * MATMUL_MODULE(CopyCubeInParams)->GetStepRow() * MATMUL_MODULE(CopyCubeInParams)->GetBufferSize();

        int calcWidth = CeilT(width, c0Size_);

        // Refill the cache when it is empty or every cached tile has been consumed.
        if (cache2UBProc_ == 0 || cache2UBProc_ >= MATMUL_MODULE(MatmulVar)->GetDepthL1CacheUB()) {
            if (cache2UBProc_ == 0) {
                cacheHead2UB_ = qidUBCache_.template AllocTensor<SRC_T>();
            } else {
                qidUBCache_.FreeTensor(cacheHead2UB_);
                cacheHead2UB_ = qidUBCache_.template AllocTensor<SRC_T>(); // To use que to insert events
            }
            if (IsL1KFullLoad()) {
                // K dimension fully resident in L1: cache successive step-col tiles.
                for (int i = 0; i < MATMUL_MODULE(MatmulVar)->GetDepthL1CacheUB(); ++i) {
                    if (IsBufferPosEnd(i)) {
                        break;
                    }
                    int copyWidth = GetCopyWidth(i, MATMUL_MODULE(MatmulShapeInfo)->template GetBaseHeight<IS_TRANS>());
                    auto b1CacheUb = cacheHead2UB_[i * cacheB1Size];
                    calcWidth = CopyNDBlock(b1CacheUb, src, srcOffset, height, copyWidth, gCol, isBankConflict);
                    // Next tile lies GetStepCol() base-heights further down the GM rows.
                    srcOffset += MATMUL_MODULE(CopyCubeInParams)->GetStepCol() *
                                 MATMUL_MODULE(MatmulShapeInfo)->template GetBaseHeight<IS_TRANS>() * (int64_t)gCol;
                }
            } else {
                // K dimension split: cache successive step-row tiles along K.
                for (int i = 0; i < MATMUL_MODULE(MatmulVar)->GetDepthL1CacheUB(); ++i) {
                    if (IsBufferKPosEnd(i)) {
                        break;
                    }
                    int copyHeight =
                        GetCopyWidth(i, MATMUL_MODULE(MatmulShapeInfo)->template GetBaseHeight<IS_TRANS>());
                    auto b1CacheUb = cacheHead2UB_[i * cacheB1Size];
                    calcWidth = CopyNDBlock(b1CacheUb, src, srcOffset, copyHeight, width, gCol, isBankConflict);
                    // Next tile lies GetStepRow() base-widths further along the GM row.
                    srcOffset += MATMUL_MODULE(CopyCubeInParams)->GetStepRow() *
                                 MATMUL_MODULE(MatmulShapeInfo)->template GetBaseWidth<IS_TRANS>();
                }
            }
            // Restart the read cursor and fence the GM->UB copies before MTE1 consumes them.
            cache2UBProc_ = 0;
            auto mte2ToMte1EvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE2_MTE1);
            SetFlag<HardEvent::MTE2_MTE1>(mte2ToMte1EvtID);
            WaitFlag<HardEvent::MTE2_MTE1>(mte2ToMte1EvtID);
        }
        // fetch data from Cache
        uint16_t blockLen = cacheB1Size * sizeof(SRC_T) / ONE_BLK_SIZE;
        auto vToMte1EvtID = GetTPipePtr()->FetchEventID(HardEvent::V_MTE1);
        SetFlag<HardEvent::V_MTE1>(vToMte1EvtID);
        WaitFlag<HardEvent::V_MTE1>(vToMte1EvtID);
        // Hand the current cached tile (slot cache2UBProc_) to the caller's buffer.
        DataCopy(transTensor, cacheHead2UB_[cache2UBProc_ * cacheB1Size], { 1, static_cast<uint16_t>(blockLen), 0, 0 });
        auto mte1ToVEvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE1_V);
        SetFlag<HardEvent::MTE1_V>((event_t)mte1ToVEvtID);
        WaitFlag<HardEvent::MTE1_V>((event_t)mte1ToVEvtID);
        ++cache2UBProc_;
        // Release the cache buffer once the last tile of the pass has been handed out.
        if (IsL1KFullLoad()) {
            if (IsBufferPosEnd()) {
                cache2UBProc_ = 0;
                qidUBCache_.FreeTensor(cacheHead2UB_);
            }
        } else {
            if (IsBufferKPosEnd()) {
                cache2UBProc_ = 0;
                qidUBCache_.FreeTensor(cacheHead2UB_);
            }
        }
        return calcWidth;
    }

    // v100, v200
    /**
     * @description: Copy an ND block of [height, width] from GM (src + srcOffset) into UB
     *               (transTensor). When gCol is not aligned to the source C0 size the copy
     *               falls back to one DMA per row; otherwise a strided DMA is used, chunked
     *               so blockCount never exceeds the descriptor limit (MAX_BLOCK_COUNT_SIZE_MM_API).
     * @param: transTensor: destination LocalTensor in UB.
     * @param: src: source GlobalTensor in GM.
     * @param: srcOffset: element offset of the block head inside src.
     * @param: height: number of rows to copy.
     * @param: width: number of elements per row to copy.
     * @param: gCol: row stride (origin matrix width) of src; must be >= width.
     * @param: isBankConflict: pad one extra 32B block per row in UB to avoid bank conflicts.
     * @return: tile width expressed in c0Size_-element blocks.
     */
    __aicore__ inline int CopyNDBlock(const LocalTensor<SRC_T>& transTensor, const GlobalTensor<SRC_T>& src,
                                      int64_t srcOffset, const int height, const int width, const int gCol,
                                      const bool isBankConflict)
    {
        ASCENDC_ASSERT((gCol >= width),
                       { KERNEL_LOG(KERNEL_ERROR, "gCol is %d, which should be no less than %d.", gCol, width); });
        int32_t oriC0Size = AuxGetC0Size<SRC_T>();
        int32_t calcWidthExr = CeilAlignT<int32_t>(width, oriC0Size);
        int32_t calcWidth = CeilT<int32_t>(calcWidthExr, c0Size_);

        // gCol unaligned
        if (gCol % oriC0Size) {
            int blockLen = calcWidthExr * sizeof(SRC_T) / DEFAULT_C0_SIZE;
            int dstOffset = 0;
            int bankConflictPadSize = isBankConflict ? (EACH_BLOCK_BYTES_MM_API / sizeof(SRC_T)) : 0;

            // data copy stride is unaligned, need to copy line by line
            for (int i = 0; i < height; i++) {
                DataCopy(transTensor[dstOffset], src[srcOffset], { 1, static_cast<uint16_t>(blockLen), 0, 0 });
                dstOffset += (calcWidthExr + bankConflictPadSize);
                srcOffset += gCol;
            }

            auto enQueEvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE2_V);
            SetFlag<HardEvent::MTE2_V>((event_t)enQueEvtID);
            WaitFlag<HardEvent::MTE2_V>((event_t)enQueEvtID);
        } else {
            int srcStride = (gCol - width) * sizeof(SRC_T) / ONE_BLK_SIZE;
            int blocklen = CeilT<int32_t>(width * sizeof(SRC_T), ONE_BLK_SIZE);
            if (srcStride >= UINT16_MAX) {
                // srcStride does not fit the 16-bit DMA descriptor field: copy row by row.
                int dstOffset = isBankConflict ? (width + oriC0Size) : width;
                for (int i = 0; i < height; ++i) {
                    DataCopy(transTensor[i * dstOffset], src[srcOffset], { 1, static_cast<uint16_t>(blocklen), 0, 0 });
                    srcOffset += gCol;
                }
            } else {
                uint16_t dstStride = isBankConflict ? 1 : 0;
                int loopNum = CeilT<int32_t>(static_cast<uint16_t>(height), MAX_BLOCK_COUNT_SIZE_MM_API);
                int tailCount = static_cast<uint16_t>(height) % MAX_BLOCK_COUNT_SIZE_MM_API;
                // Bugfix: when height is an exact multiple of MAX_BLOCK_COUNT_SIZE_MM_API the
                // remainder is 0 and the final chunk would issue a DataCopy with blockCount 0,
                // silently dropping data; the final chunk must then carry a full chunk instead.
                if (tailCount == 0) {
                    tailCount = MAX_BLOCK_COUNT_SIZE_MM_API;
                }
                for (int i = 0; i < loopNum; ++i) {
                    uint16_t blockCount = (i == loopNum - 1) ? tailCount : MAX_BLOCK_COUNT_SIZE_MM_API;
                    DataCopy(
                        transTensor[i * MAX_BLOCK_COUNT_SIZE_MM_API * blocklen * ONE_BLK_SIZE / sizeof(SRC_T)],
                        src[srcOffset + i * MAX_BLOCK_COUNT_SIZE_MM_API * blocklen * ONE_BLK_SIZE / sizeof(SRC_T)],
                        { blockCount, static_cast<uint16_t>(blocklen), static_cast<uint16_t>(srcStride), dstStride });
                }
            }
            auto enQueEvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE2_V);
            SetFlag<HardEvent::MTE2_V>((event_t)enQueEvtID);
            WaitFlag<HardEvent::MTE2_V>((event_t)enQueEvtID);
        }
        return calcWidth;
    }

    // v100, v200
    /**
     * @description: Pad zeros for the ND matrix for width (zero the lanes beyond `tail`
     *               in the last partial c0 block of each row, via masked Duplicate).
     * @param: dst: LocalTensor in L1.
     * @param: height: Height of the tile to be loaded.
     * @param: calcWidth: Width of the tile to be calculated.
     * @param: tail: tailSize (first lane index, within a 16-lane group, to zero from).
     * @param: offset: offset from head addr to the block.
     * @return: void
     */
    __aicore__ inline void NDPadZeroForWidth(LocalTensor<TRANS_T>& dst,
        const int height, const int calcWidth, const int tail, int offset)
    {
        // 16-bit mask with bits [tail, 16) set: the lanes that must be zeroed.
        uint16_t mask_tail_16bit = ~((1 << tail) - 1);
        uint64_t mask_tail_64bit = mask_tail_16bit;
        if (mask_tail_64bit == 0) {
            // tail fills a whole 16-lane group: nothing to pad.
            return;
        }
        uint64_t mask[2];
        // Replicate the 16-bit mask into all four 16-lane groups of each 64-bit mask word.
        mask[0] = mask_tail_64bit + (mask_tail_64bit << FIRST_16BIT_OFFSET_MM_API) +
                    (mask_tail_64bit << SECOND_16BIT_OFFSET_MM_API) + (mask_tail_64bit << THIRD_16BIT_OFFSET_MM_API);
        mask[1] = mask[0];
        // Row-group stride expressed in 32B blocks inside the NZ tile.
        int stride = calcWidth * (c0Size_ * sizeof(TRANS_T) / DEFAULT_C0_SIZE);
        int32_t totalRep = CeilT<int32_t>(height, REPEAT_BLOCK_NUM_MM_API);
        if constexpr (IsSameTypeV<TRANS_T, int8_t>) {
            // Reinterpret as int16 so one vector lane clears two int8 elements.
            LocalTensor<int16_t> tmpTransTensor = dst.template ReinterpretCast<int16_t>();
            if (stride < EACH_BLOCK_BYTES_MM_API) {
                if (totalRep <= MAX_REPEAT_TIMES) {
                    Duplicate(tmpTransTensor[offset], (int16_t)0, mask,
                        CeilT<int32_t>(height, REPEAT_BLOCK_NUM_MM_API), stride, REPEAT_BLOCK_NUM_MM_API * stride);
                } else {
                    // Repeat count exceeds the hardware maximum: split into full chunks plus a tail.
                    int32_t highBlock = totalRep / MAX_REPEAT_TIMES;
                    int32_t highTail = totalRep % MAX_REPEAT_TIMES;
                    int64_t dstOffset = calcWidth * BLOCK_CUBE * REPEAT_BLOCK_NUM_MM_API * MAX_REPEAT_TIMES;
                    for (int32_t idx = 0; idx < highBlock; ++idx) {
                        Duplicate(tmpTransTensor[offset], (int16_t)0, mask,
                            MAX_REPEAT_TIMES, stride, REPEAT_BLOCK_NUM_MM_API * stride);
                        offset += dstOffset;
                    }
                    if (highTail) {
                        Duplicate(tmpTransTensor[offset], (int16_t)0, mask, highTail,
                            stride, REPEAT_BLOCK_NUM_MM_API * stride);
                    }
                }
            } else {
                // Repeat stride would overflow the instruction field: issue one repeat at a time.
                for (int32_t i = 0; i < totalRep; ++i) {
                    Duplicate(tmpTransTensor[offset], (int16_t)0, mask, 1, stride, 0);
                    offset += stride * BLOCK_CUBE;
                }
            }
        } else {
            Duplicate(dst[offset], (TRANS_T)0, mask, totalRep, stride, REPEAT_BLOCK_NUM_MM_API * stride);
        }
        // Order the Duplicate writes before subsequent vector ops on the same buffer.
        PipeBarrier<PIPE_V>();
    }

    // v100, v200
    /**
     * @description: Pad zeros for the ND matrix: zero the ragged width tail of each row
     *               (via NDPadZeroForWidth) and the ragged height tail below the tile.
     * @param: dst: LocalTensor in L1.
     * @param: height: Height of the tile to be loaded.
     * @param: calcWidth: Width of the tile to be calculated.
     * @param: gCol: Origin matrix width.
     * @param: width: Width of the tile to be loaded.
     * @param: isBankConflict: The flag of whether is the bank conflict scene.
     * @return: void
     */
    __aicore__ inline void NDPadZeros(LocalTensor<TRANS_T>& dst, const int height, const int calcWidth, const int gCol,
                                      const int width, bool isBankConflict)
    {
        // Number of ragged elements in the last partial c0 block of a row.
        int tail = width % c0Size_;
        if ((gCol % BLOCK_CUBE != 0) && (tail != 0)) {
            // tail pad zero
            constexpr int32_t DIV_TWO = 2;
            auto offset = width / c0Size_ * c0Size_;
            if constexpr (IsSameType<TRANS_T, int8_t>::value) {
                // int8 is padded via int16 lanes: halve the element-based tail and offset.
                tail = CeilT(tail, DIV_TWO);
                offset /= DIV_TWO;
            }
            NDPadZeroForWidth(dst, height, calcWidth, tail, offset);
        }
        // If the value of high is not an integer multiple of 16, add 0.
        int tailHigh = height % BLOCK_CUBE;
        if (tailHigh) {
            // Zero the rows between height and the next multiple of BLOCK_CUBE.
            auto dstOffset = height * calcWidth * BLOCK_CUBE;
            if constexpr (IsSameTypeV<TRANS_T, int8_t>) {
                LocalTensor<int16_t> tmpDst = dst.template ReinterpretCast<int16_t>();
                Duplicate(tmpDst[dstOffset], (int16_t)0, (BLOCK_CUBE - tailHigh) * calcWidth * BLOCK_CUBE);
            } else {
                Duplicate(dst[dstOffset], (TRANS_T)0, (BLOCK_CUBE - tailHigh) * calcWidth * BLOCK_CUBE);
            }
        }
    }

    // v100, v200
    /**
     * @description: NDTrans2NZForInt8 — convert an int8 ND tile in UB to NZ layout, using
     *               vector Muls-by-1 as a strided copy; data is reinterpreted as int16 so
     *               one lane moves two int8 elements.
     * @param: dst: LocalTensor in UB holding the NZ result.
     * @param: src: LocalTensor in UB holding the ND source.
     * @param: calcHigh: Height of the tile to be calculated.
     * @param: calcWidth: Width of the tile to be calculated.
     * @param: isBankConflict: The flag of whether is the bank conflict scene.
     * @return: void
     */
    __aicore__ inline void NDTrans2NZForInt8(LocalTensor<TRANS_T>& dst,
        LocalTensor<TRANS_T>& src, const int calcHigh, const int calcWidth, const bool isBankConflict)
    {
        struct UnaryRepeatParams intriParams;
        uint64_t mask[2] = { uint64_t(-1), uint64_t(-1) };
        // One extra block per row was padded in UB when avoiding bank conflicts.
        int blkStride = isBankConflict ? calcWidth + 1 : calcWidth;
        intriParams.dstBlkStride = (c0Size_ * sizeof(TRANS_T) / DEFAULT_C0_SIZE);
        intriParams.srcBlkStride = blkStride * (c0Size_ * sizeof(TRANS_T) / DEFAULT_C0_SIZE);
        intriParams.dstRepStride = intriParams.dstBlkStride * DEFAULT_BLK_NUM;
        intriParams.srcRepStride = intriParams.srcBlkStride * DEFAULT_BLK_NUM;
        int dstOffset = 0;
        int srcOffset = 0;
        // ensure rep stride be less than 256
        constexpr int maxSrcBlkStride = 32;
        LocalTensor<int16_t> tmpSrc = src.template ReinterpretCast<int16_t>();
        LocalTensor<int16_t> tmpDst = dst.template ReinterpretCast<int16_t>();
        if (intriParams.srcBlkStride >= maxSrcBlkStride) {
            // Source stride too large for one instruction: move one 16-lane group per call.
            intriParams.dstBlkStride = 1;
            intriParams.srcBlkStride = 1;
            mask[0] = (1 << BLOCK_CUBE) - 1;
            mask[1] = 0;
            SetVectorMask<int16_t>(mask[1], mask[0]);
            for (int i = 0; i < calcWidth; i++) {
                for (int j = 0; j < calcHigh * BLOCK_CUBE; ++j) {
                    dstOffset = i * calcHigh * CUBE_MAX_SIZE + j * BLOCK_CUBE;
                    srcOffset = j * blkStride * BLOCK_CUBE + i * BLOCK_CUBE;
                    // Muls by 1 acts as a strided copy between the ND and NZ layouts.
                    Muls<int16_t, false>(tmpDst[dstOffset], tmpSrc[srcOffset], (int16_t)1, mask, 1, intriParams);
                }
            }
        } else {
            SetVectorMask<int16_t>(mask[1], mask[0]);
            // Chunk the repeat count so a single Muls never exceeds MAX_REPEAT_TIMES.
            int32_t totalRepTimes = 2 * calcHigh;
            int32_t highBlock = totalRepTimes / MAX_REPEAT_TIMES;
            int32_t highTail = totalRepTimes % MAX_REPEAT_TIMES;
            for (int i = 0; i < calcWidth; i++) {
                dstOffset = i * calcHigh * CUBE_MAX_SIZE;
                srcOffset = i * BLOCK_CUBE;
                for (int32_t idx = 0; idx < highBlock; ++idx) {
                    Muls<int16_t, false>(tmpDst[dstOffset], tmpSrc[srcOffset], (int16_t)1, mask, MAX_REPEAT_TIMES,
                                            intriParams);
                    dstOffset += BLOCK_CUBE * MAX_REPEAT_TIMES * REPEAT_BLOCK_NUM_MM_API;
                    srcOffset += calcWidth * BLOCK_CUBE * MAX_REPEAT_TIMES * REPEAT_BLOCK_NUM_MM_API;
                }
                if (highTail) {
                    Muls<int16_t, false>(tmpDst[dstOffset], tmpSrc[srcOffset], (int16_t)1, mask, highTail,
                                            intriParams);
                }
            }
        }
    }

    // v100, v200
    /**
     * @description: NDTrans2NZForFP16 — convert a half/float ND tile in UB to NZ layout,
     *               using vector Muls-by-1 as a strided copy.
     * @param: dst: LocalTensor in UB holding the NZ result.
     * @param: src: LocalTensor in UB holding the ND source.
     * @param: calcHigh: Height of the tile to be calculated.
     * @param: calcWidth: Width of the tile to be calculated.
     * @param: isBankConflict: The flag of whether is the bank conflict scene.
     * @return: void
     */
    __aicore__ inline void NDTrans2NZForFP16(LocalTensor<TRANS_T>& dst,
        LocalTensor<TRANS_T>& src, const int calcHigh, const int calcWidth, const bool isBankConflict)
    {
        const int c0Count = AscendCUtils::GetC0Count(sizeof(TRANS_T));
        struct UnaryRepeatParams intriParams;
        uint64_t mask[2] = { uint64_t(-1), uint64_t(-1) };
        int32_t padBlock = 1;
        constexpr int32_t BLOCK_NUM = 2;
        // int8->half anti-quant rows carry two pad blocks instead of one.
        if constexpr (IsSameTypeV<TRANS_T, half> && IsSameTypeV<SRC_T, int8_t>) {
            padBlock = BLOCK_NUM;
        }
        // Extra block(s) per row were padded in UB when avoiding bank conflicts.
        int blkStride = isBankConflict ? calcWidth + padBlock : calcWidth;
        intriParams.dstBlkStride = (BLOCK_CUBE * sizeof(TRANS_T) / DEFAULT_C0_SIZE);
        intriParams.srcBlkStride = blkStride * BLOCK_CUBE * sizeof(TRANS_T) / DEFAULT_C0_SIZE;
        intriParams.dstRepStride = intriParams.dstBlkStride * DEFAULT_BLK_NUM;
        intriParams.srcRepStride = intriParams.srcBlkStride * DEFAULT_BLK_NUM;
        int dstOffset = 0;
        int srcOffset = 0;
        // ensure rep stride be less than 256
        constexpr int maxSrcBlkStride = 32;
        if (intriParams.srcBlkStride >= maxSrcBlkStride) {
            // Source stride too large for one instruction: move one 16-lane group per call.
            intriParams.dstBlkStride = 1;
            intriParams.srcBlkStride = 1;
            mask[0] = (1 << BLOCK_CUBE) - 1;
            mask[1] = 0;
            SetVectorMask<TRANS_T>(mask[1], mask[0]);
            for (int i = 0; i < calcWidth; i++) {
                for (int j = 0; j < calcHigh * BLOCK_CUBE; ++j) {
                    dstOffset = i * calcHigh * CUBE_MAX_SIZE + j * BLOCK_CUBE;
                    srcOffset = j * blkStride * BLOCK_CUBE + i * BLOCK_CUBE;
                    // Muls by 1 acts as a strided copy between the ND and NZ layouts.
                    Muls<TRANS_T, false>(dst[dstOffset], src[srcOffset], (TRANS_T)1, mask, 1, intriParams);
                    if constexpr (sizeof(TRANS_T) == sizeof(float)) {
                        // float rows span two 32B blocks per 16 elements: move the second half.
                        Muls<TRANS_T, false>(dst[dstOffset + c0Count], src[srcOffset + c0Count], (TRANS_T)1, mask,
                                                1, intriParams);
                    }
                }
            }
        } else {
            SetVectorMask<TRANS_T>(mask[1], mask[0]);
            // NOTE(review): repeat count BLOCK_NUM * calcHigh is not chunked by MAX_REPEAT_TIMES
            // here, unlike NDTrans2NZForInt8 — confirm calcHigh is bounded so it fits.
            for (int i = 0; i < calcWidth; i++) {
                dstOffset = i * calcHigh * CUBE_MAX_SIZE;
                srcOffset = i * BLOCK_CUBE;
                Muls<TRANS_T, false>(dst[dstOffset], src[srcOffset], (TRANS_T)1, mask, BLOCK_NUM * calcHigh, intriParams);
                if constexpr (sizeof(TRANS_T) == sizeof(float)) {
                    Muls<TRANS_T, false>(dst[dstOffset + c0Count], src[srcOffset + c0Count], (TRANS_T)1, mask,
                                            BLOCK_NUM * calcHigh, intriParams);
                }
            }
        }
    }

    // v100, v200
    /**
     * @description: NDTrans2NZ — convert an ND tile in UB to NZ layout by dispatching to
     *               the TRANS_T-specific implementation (both use Muls as a strided copy).
     * @param: dst: LocalTensor in UB holding the NZ result.
     * @param: src: LocalTensor in UB holding the ND source.
     * @param: calcHigh: Height of the tile to be calculated.
     * @param: calcWidth: Width of the tile to be calculated.
     * @param: isBankConflict: The flag of whether is the bank conflict scene.
     * @return: void
     */
    __aicore__ inline void NDTrans2NZ(LocalTensor<TRANS_T>& dst, LocalTensor<TRANS_T>& src, const int calcHigh,
                                      const int calcWidth, const bool isBankConflict)
    {
        // Compile-time dispatch on the transit element type.
        if constexpr (!IsSameTypeV<TRANS_T, int8_t>) {
            NDTrans2NZForFP16(dst, src, calcHigh, calcWidth, isBankConflict);
        } else {
            NDTrans2NZForInt8(dst, src, calcHigh, calcWidth, isBankConflict);
        }
    }

    // v100, v200
    // Copy an ND tile [height, width] from GM straight into L1 (dst) in NZ layout.
    // Full c0 blocks go GM->L1 column-block by column-block with (strided) DataCopy; an
    // unaligned width tail takes a GM->UB->zero-pad->L1 detour through the local workspace.
    // On v200 a ragged height is first pre-zeroed in L1 with InitConstValue.
    __aicore__ inline void CopyND2NZOnTheFly(const LocalTensor<TRANS_T>& dst, GlobalTensor<SRC_T>& src, const int row,
                                             const int col, const int height, const int width, const int gCol)
    {
        ASSERT(gCol >= width && "Copy ND block gm->ub width larger than origin matrix width.");
        int calcWidth = width / c0Size_; // cube block numbers that do not need to be pad zero
        int tail = width % c0Size_;
        int dstOffset = 0;
        int64_t srcOffset = ((int64_t)row * (int64_t)gCol + (int64_t)col);
        int calcWidthExr = CeilT<int32_t>(width, c0Size_);
        int calcHeightExr = CeilT<int32_t>(height, BLOCK_CUBE);

#if __CCE_AICORE__ == 200
        // set2d, pad tail zero
        if (height % BLOCK_CUBE != 0) {
            int64_t repeat = calcWidthExr * calcHeightExr;
            InitConstValueParams<TRANS_T> initConstValueParams;
            initConstValueParams.repeatTimes = (uint16_t)repeat;
            initConstValueParams.initValue = 0;
            InitConstValue(dst, initConstValueParams);
            PipeBarrier<PIPE_MTE2>();
        }
#endif

        // gCol unaligned, can not use dma copy repeat stride
        if (tail != 0) {
            // tail elements that need to be pad zero
            int blockLen = calcWidthExr * (c0Size_ * sizeof(TRANS_T) / DEFAULT_C0_SIZE);

            // gm->l1
            int srcGap = gCol * sizeof(TRANS_T) / ONE_BLK_SIZE - 1;
            if (gCol % c0Size_ || srcGap >= UINT16_MAX) {
                // each block len is only 32B
                for (int i = 0; i < calcWidth; i++) {
                    for (int j = 0; j < height; j++) {
                        DataCopy(dst[dstOffset + i * calcHeightExr * BLOCK_CUBE * c0Size_ + j * c0Size_],
                                 src[srcOffset + j * gCol + i * c0Size_], { 1, 1, 0, 0 });
                    }
                }
            } else {
                // data copy stride is aligned
                for (int i = 0; i < calcWidth; i++) {
                    DataCopy(dst[dstOffset], src[srcOffset],
                             { static_cast<uint16_t>(height), 1, static_cast<uint16_t>(srcGap), 0 });
                    dstOffset += calcHeightExr * BLOCK_CUBE * c0Size_;
                    srcOffset += c0Size_;
                }
            }

            LocalTensor<TRANS_T> trans;
            // tail gm->ub pad zero, and then ub->l1

            // Stage the ragged tail column through the shared local workspace in UB.
            trans = MATMUL_MODULE(MatmulVar)
                        ->GetLocalWorkspace(MATMUL_MODULE(MatmulVar)->GetNd2NzOffset())
                        .template ReinterpretCast<TRANS_T>();
            // NOTE(review): sized with sizeof(TRANS_T) here, while the UB-source overload
            // uses sizeof(SRC_T) — confirm intended when the two types differ.
            int tranSize = width * EACH_BLOCK_BYTES_MM_API / sizeof(TRANS_T);
            trans.SetSize(tranSize);

            // GM offset of the first ragged tail element (past the full c0 blocks).
            int64_t tailSrcoffset = (int64_t)row * (int64_t)gCol + (int64_t)col + (int64_t)calcWidth * (int64_t)c0Size_;

            // gm->ub
            for (int i = 0; i < height; i++) {
                DataCopy(trans[i * c0Size_], src[tailSrcoffset], { 1, 1, 0, 0 });
                tailSrcoffset += gCol;
            }

            event_t eventIDMte2ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V));
            SetFlag<HardEvent::MTE2_V>(eventIDMte2ToV);
            WaitFlag<HardEvent::MTE2_V>(eventIDMte2ToV);

            // tail pad zero
            uint64_t mask[2];
            constexpr int32_t DIV_TWO = 2;
            if constexpr (IsSameTypeV<TRANS_T, int8_t>) {
                // int8 is zeroed via int16 lanes: halve the element-based tail.
                tail = CeilT<int32_t>(tail, DIV_TWO);
            }
            // 16-bit mask with bits [tail, 16) set, replicated across all four lane groups.
            uint16_t mask_tail_16bit = ~((1 << tail) - 1);
            uint64_t mask_tail_64bit = mask_tail_16bit;
            mask[0] = mask_tail_64bit + (mask_tail_64bit << FIRST_16BIT_OFFSET_MM_API) +
                (mask_tail_64bit << SECOND_16BIT_OFFSET_MM_API) + (mask_tail_64bit << THIRD_16BIT_OFFSET_MM_API);
            mask[1] = mask[0];
            constexpr int32_t DUP_CEIL_NUM = 8;
            if constexpr (IsSameTypeV<TRANS_T, int8_t>) {
                LocalTensor<int16_t> tmpTrans = trans.template ReinterpretCast<int16_t>();
                Duplicate(tmpTrans, (int16_t)0, mask, CeilT<int32_t>(height, DUP_CEIL_NUM), 1, DUP_CEIL_NUM);
            } else {
                Duplicate(trans, (TRANS_T)0, mask, CeilT<int32_t>(height, DUP_CEIL_NUM), 1, DUP_CEIL_NUM);
            }

            event_t eventIDVToMte3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
            SetFlag<HardEvent::V_MTE3>(eventIDVToMte3);
            WaitFlag<HardEvent::V_MTE3>(eventIDVToMte3);

            // ub->l1
            int heightAlignBlock = CeilT<int32_t>(height, BLOCK_CUBE);
            int tailDstOffset = heightAlignBlock * BLOCK_CUBE * c0Size_ * calcWidth;
            DataCopy(dst[tailDstOffset], trans, { static_cast<uint16_t>(height), 1, 0, 0 });
        } else {
            // Width is c0-aligned: copy all full blocks GM->L1 directly.
            int srcGap = gCol * sizeof(TRANS_T) / ONE_BLK_SIZE - 1;
            if (gCol % c0Size_ != 0 || srcGap >= UINT16_MAX) {
                int64_t oriSrcOffset = srcOffset;
                int oriDstOffset = dstOffset;
                // each block len is only 32B
                for (int i = 0; i < calcWidth; i++) {
                    for (int j = 0; j < height; j++) {
                        DataCopy(dst[dstOffset], src[srcOffset], { 1, 1, 0, 0 });
                        dstOffset += c0Size_;
                        srcOffset += gCol;
                    }
                    srcOffset = oriSrcOffset + (i + 1) * c0Size_;
                    dstOffset = oriDstOffset + (i + 1) * calcHeightExr * BLOCK_CUBE * c0Size_;
                }
            } else {
                // data copy stride is aligned
                for (int i = 0; i < calcWidth; i++) {
                    DataCopy(dst[dstOffset], src[srcOffset],
                             { static_cast<uint16_t>(height), 1, static_cast<uint16_t>(srcGap), 0 });
                    dstOffset += calcHeightExr * BLOCK_CUBE * c0Size_;
                    srcOffset += c0Size_;
                }
            }
            // Fence the GM->L1 copies before MTE1 consumes dst.
            event_t eventIDMte2ToMte1 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_MTE1));
            SetFlag<HardEvent::MTE2_MTE1>(eventIDMte2ToMte1);
            WaitFlag<HardEvent::MTE2_MTE1>(eventIDMte2ToMte1);
        }
    }

    // v100, v200
    // Copies an ND-format (row-major) [height x width] tile from UB `src` into NZ-format `dst`,
    // zero-padding partial c0 blocks "on the fly" (no dedicated ND->NZ hardware path).
    // row/col  : tile origin inside the source matrix.
    // gCol     : row stride of the source matrix (its full width in elements).
    __aicore__ inline void CopyND2NZOnTheFly(const LocalTensor<TRANS_T>& dst, LocalTensor<SRC_T>& src, const int row,
                                             const int col, const int height, const int width, const int gCol)
    {
        ASSERT(gCol >= width && "Copy ND block ub->ub width larger than origin matrix width.");
        int calcWidth = width / c0Size_; // cube block numbers that do not need to be pad zero
        int tail = width % c0Size_;      // leftover columns (< c0Size_) that must be zero padded
        int dstOffset = 0;
        int srcOffset = (row * gCol + col);
        int calcWidthExr = CeilT<int32_t>(width, c0Size_);      // width in c0 blocks, rounded up
        int calcHeightExr = CeilT<int32_t>(height, BLOCK_CUBE); // height in cube blocks, rounded up

#if __CCE_AICORE__ == 200
        // set2d, pad tail zero
        // When height is not cube-aligned, pre-clear the whole destination so the rows
        // never written by the copies below end up as zeros.
        if (height % BLOCK_CUBE != 0) {
            int64_t repeat = calcWidthExr * calcHeightExr;
            InitConstValueParams<TRANS_T> initConstValueParams;
            initConstValueParams.repeatTimes = (uint16_t)repeat;
            initConstValueParams.initValue = 0;
            InitConstValue(dst, initConstValueParams);

            // Order the clear (MTE2) before the data copies (MTE3) that follow.
            event_t eventIDMte2ToMte3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_MTE3));
            SetFlag<HardEvent::MTE2_MTE3>(eventIDMte2ToMte3);
            WaitFlag<HardEvent::MTE2_MTE3>(eventIDMte2ToMte3);
        }
#endif

        DataCopyEnhancedParams enhancedParams;
        enhancedParams.blockMode = BlockMode::BLOCK_MODE_VECTOR;

        // gCol unaligned, can not use dma copy repeat stride
        if (tail != 0) {
            // tail elements that need to be pad zero
            int blockLen = calcWidthExr * (c0Size_ * sizeof(TRANS_T) / DEFAULT_C0_SIZE);

            // ub->l1
            // Per-row gap between consecutive c0 blocks of the same column, in 32B units.
            int srcGap = gCol * sizeof(TRANS_T) / ONE_BLK_SIZE - 1;
            if (gCol % c0Size_ || srcGap >= UINT16_MAX) {
                // each block len is only 32B
                // Stride cannot be expressed in DataCopy params; fall back to one copy per row.
                for (int i = 0; i < calcWidth; i++) {
                    for (int j = 0; j < height; j++) {
                        DataCopy(dst[dstOffset + i * calcHeightExr * BLOCK_CUBE * c0Size_ + j * c0Size_],
                                 src[srcOffset + j * gCol + i * c0Size_], { 1, 1, 0, 0 }, enhancedParams);
                    }
                }
            } else {
                // data copy stride is aligned
                // One strided DataCopy per c0-block column.
                for (int i = 0; i < calcWidth; i++) {
                    DataCopy(dst[dstOffset], src[srcOffset],
                             { static_cast<uint16_t>(height), 1, static_cast<uint16_t>(srcGap), 0 }, enhancedParams);
                    dstOffset += calcHeightExr * BLOCK_CUBE * c0Size_;
                    srcOffset += c0Size_;
                }
            }

            LocalTensor<TRANS_T> trans;
            // tail gm->ub pad zero, and then ub->l1
            trans = MATMUL_MODULE(MatmulVar)
                        ->GetLocalWorkspace(MATMUL_MODULE(MatmulVar)->GetNd2NzOffset())
                        .template ReinterpretCast<TRANS_T>();
            // NOTE(review): tranSize divides by sizeof(SRC_T) although `trans` holds TRANS_T —
            // confirm this is intended when SRC_T != TRANS_T.
            int tranSize = width * EACH_BLOCK_BYTES_MM_API / sizeof(SRC_T);
            trans.SetSize(tranSize);

            int tailSrcoffset = row * gCol + col + calcWidth * c0Size_;
            // ub->ub
            // Gather the tail column (one 32B block per row) into the workspace.
            for (int i = 0; i < height; i++) {
                DataCopy(trans[i * c0Size_], src[tailSrcoffset], { 1, 1, 0, 0 }, enhancedParams);
                tailSrcoffset += gCol;
            }

            // Gather above must complete before the vector Duplicate below.
            event_t eventIDMte2ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V));
            SetFlag<HardEvent::MTE2_V>(eventIDMte2ToV);
            WaitFlag<HardEvent::MTE2_V>(eventIDMte2ToV);

            // tail pad zero
            // Build a per-16-lane bit mask that selects only the elements past `tail`,
            // replicated into all four 16-bit groups of each 64-bit mask word.
            uint64_t mask[2];
            uint16_t mask_tail_16bit = ~((1 << tail) - 1);
            uint64_t mask_tail_64bit = mask_tail_16bit;
            mask[0] = mask_tail_64bit + (mask_tail_64bit << FIRST_16BIT_OFFSET_MM_API) +
                (mask_tail_64bit << SECOND_16BIT_OFFSET_MM_API) + (mask_tail_64bit << THIRD_16BIT_OFFSET_MM_API);
            mask[1] = mask[0];
            constexpr int32_t DUP_CEIL_NUM = 8;
            if constexpr (IsSameTypeV<TRANS_T, int8_t>) {
                // Duplicate has no int8 overload; reinterpret as int16 pairs.
                LocalTensor<int16_t> tmpTrans = trans.template ReinterpretCast<int16_t>();
                Duplicate(tmpTrans, (int16_t)0, mask, static_cast<uint8_t>(CeilT<int32_t>(height, DUP_CEIL_NUM)), 1, DUP_CEIL_NUM);
            } else {
                Duplicate(trans, (TRANS_T)0, mask, static_cast<uint8_t>(CeilT<int32_t>(height, DUP_CEIL_NUM)), 1, DUP_CEIL_NUM);
            }

            // Zero padding must land before the final MTE3 copy out.
            event_t eventIDVToMte3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
            SetFlag<HardEvent::V_MTE3>(eventIDVToMte3);
            WaitFlag<HardEvent::V_MTE3>(eventIDVToMte3);

            // ub->l1
            // Append the padded tail column after the `calcWidth` full columns.
            int heightAlignBlock = CeilT<int32_t>(height, BLOCK_CUBE);
            int tailDstOffset = heightAlignBlock * BLOCK_CUBE * c0Size_ * calcWidth;
            DataCopy(dst[tailDstOffset], trans, { static_cast<uint16_t>(height), 1, 0, 0 }, enhancedParams);
        } else {
            // Width is c0-aligned: no tail handling needed.
            int srcGap = gCol * sizeof(TRANS_T) / ONE_BLK_SIZE - 1;
            if (gCol % c0Size_ || srcGap >= UINT16_MAX) {
                int oriSrcOffset = srcOffset;
                int oriDstOffset = dstOffset;
                // each block len is only 32B
                for (int i = 0; i < calcWidth; i++) {
                    for (int j = 0; j < height; j++) {
                        DataCopy(dst[dstOffset], src[srcOffset], { 1, 1, 0, 0 }, enhancedParams);
                        dstOffset += c0Size_;
                        srcOffset += gCol;
                    }
                    srcOffset = oriSrcOffset + (i + 1) * c0Size_;
                    dstOffset = oriDstOffset + (i + 1) * calcHeightExr * BLOCK_CUBE * c0Size_;
                }
            } else {
                // data copy stride is aligned
                for (int i = 0; i < calcWidth; i++) {
                    DataCopy(dst[dstOffset], src[srcOffset],
                             { static_cast<uint16_t>(height), 1, static_cast<uint16_t>(srcGap), 0 }, enhancedParams);
                    dstOffset += calcHeightExr * BLOCK_CUBE * c0Size_;
                    srcOffset += c0Size_;
                }
            }
        }
    }

    // Converts an ND tile held in UB into NZ format via the vector TransData path:
    // stage 1 gathers the tile row-by-row into workspace, stage 2 transposes it,
    // stage 3 copies the NZ result into `dst`. Extents are rounded up to c0 blocks.
    template <bool IS_TRANS = false>
    __aicore__ inline void CopyND2NZWithTransData(const LocalTensor<TRANS_T>& dst, LocalTensor<SRC_T>& src,
                                                  const int row, const int col, const int tileHeight,
                                                  const int tileWidth)
    {
        const int alignedWidth = CeilT(tileWidth, c0Size_) * c0Size_;
        const int alignedHeight = CeilT(tileHeight, c0Size_) * c0Size_;
        const int64_t bufSize = alignedHeight * alignedWidth;
        // Stage 1: gather the ND tile into the first workspace buffer.
        LocalTensor<TRANS_T> ndBuffer = MATMUL_MODULE(MatmulVar)
                                            ->GetLocalWorkspace(MATMUL_MODULE(MatmulVar)->GetNd2NzOffset())
                                            .template ReinterpretCast<TRANS_T>();
        ndBuffer.SetSize(bufSize);
        const int orgWidth = MATMUL_MODULE(MatmulShapeInfo)->template GetOrgWidth<IS_TRANS>();
        int readPos = row * MATMUL_MODULE(MatmulShapeInfo)->template GetBaseHeight<IS_TRANS>() * orgWidth +
                      col * MATMUL_MODULE(MatmulShapeInfo)->template GetBaseWidth<IS_TRANS>();
        int writePos = 0;
        for (int line = 0; line < tileHeight; ++line) {
            DataCopy(ndBuffer[writePos], src[readPos], alignedWidth);
            readPos += orgWidth;
            writePos += alignedWidth;
        }
        // Stage 2: transpose into a second workspace buffer placed right after the first.
        LocalTensor<TRANS_T> nzBuffer = MATMUL_MODULE(MatmulVar)
                                            ->GetLocalWorkspace(MATMUL_MODULE(MatmulVar)->GetNd2NzOffset() + bufSize)
                                            .template ReinterpretCast<TRANS_T>();
        nzBuffer.SetSize(bufSize);
        // Synchronize MTE3 -> V: the gather must finish before the vector transpose reads it.
        event_t evGatherDone = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_V));
        SetFlag<HardEvent::MTE3_V>(evGatherDone);
        WaitFlag<HardEvent::MTE3_V>(evGatherDone);
        TransDataNDBMatrix(nzBuffer, ndBuffer, tileHeight, tileWidth);
        // Synchronize V -> MTE3: the transpose must finish before the copy-out reads it.
        event_t evTransDone = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
        SetFlag<HardEvent::V_MTE3>(evTransDone);
        WaitFlag<HardEvent::V_MTE3>(evTransDone);
        // Stage 3: move the transposed tile into the destination.
        CopyNZ2NZ(dst, nzBuffer, 0, 0, alignedWidth, alignedHeight, alignedWidth);
    }

    // GlobalTensor overload: converts an ND tile read from GM into NZ format via the
    // vector TransData path. Mirrors the LocalTensor overload, but the gather runs on
    // MTE2 (gm->ub), so the pre-transpose sync is MTE2 -> V instead of MTE3 -> V.
    template <bool IS_TRANS = false>
    __aicore__ inline void CopyND2NZWithTransData(const LocalTensor<TRANS_T>& dst, GlobalTensor<SRC_T>& src,
                                                  const int row, const int col, const int tileHeight,
                                                  const int tileWidth)
    {
        const int alignedWidth = CeilT(tileWidth, c0Size_) * c0Size_;
        const int alignedHeight = CeilT(tileHeight, c0Size_) * c0Size_;
        const int64_t bufSize = alignedHeight * alignedWidth;
        // Stage 1: load the ND tile from GM row-by-row into the first workspace buffer.
        LocalTensor<TRANS_T> ndBuffer = MATMUL_MODULE(MatmulVar)
                                            ->GetLocalWorkspace(MATMUL_MODULE(MatmulVar)->GetNd2NzOffset())
                                            .template ReinterpretCast<TRANS_T>();
        ndBuffer.SetSize(bufSize);
        const int orgWidth = MATMUL_MODULE(MatmulShapeInfo)->template GetOrgWidth<IS_TRANS>();
        int readPos = row * MATMUL_MODULE(MatmulShapeInfo)->template GetBaseHeight<IS_TRANS>() * orgWidth +
                      col * MATMUL_MODULE(MatmulShapeInfo)->template GetBaseWidth<IS_TRANS>();
        int writePos = 0;
        for (int line = 0; line < tileHeight; ++line) {
            DataCopy(ndBuffer[writePos], src[readPos], alignedWidth);
            readPos += orgWidth;
            writePos += alignedWidth;
        }
        // Stage 2: transpose into a second workspace buffer placed right after the first.
        LocalTensor<TRANS_T> nzBuffer = MATMUL_MODULE(MatmulVar)
                                            ->GetLocalWorkspace(MATMUL_MODULE(MatmulVar)->GetNd2NzOffset() + bufSize)
                                            .template ReinterpretCast<TRANS_T>();
        nzBuffer.SetSize(bufSize);
        // Synchronize MTE2 -> V: the GM load must finish before the vector transpose reads it.
        event_t evLoadDone = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V));
        SetFlag<HardEvent::MTE2_V>(evLoadDone);
        WaitFlag<HardEvent::MTE2_V>(evLoadDone);
        TransDataNDBMatrix(nzBuffer, ndBuffer, tileHeight, tileWidth);
        // Synchronize V -> MTE3: the transpose must finish before the copy-out reads it.
        event_t evTransDone = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
        SetFlag<HardEvent::V_MTE3>(evTransDone);
        WaitFlag<HardEvent::V_MTE3>(evTransDone);
        // Stage 3: move the transposed tile into the destination.
        CopyNZ2NZ(dst, nzBuffer, 0, 0, alignedWidth, alignedHeight, alignedWidth);
    }

    // Transposes an NZ tile already resident in UB and writes the result to `dst`.
    // The transpose output is staged in workspace at GetNd2NzOffset() + tile size.
    template <bool IS_TRANS = false>
    __aicore__ inline void CopyNZ2NZWithTransData(const LocalTensor<TRANS_T>& dst, LocalTensor<SRC_T>& src,
                                                  const int row, const int col, const int tileHeight,
                                                  const int tileWidth)
    {
        const int64_t tileSize = tileHeight * tileWidth;
        // Start of the tile inside the NZ source: the row term steps by c0 blocks,
        // the column term scales by the original matrix height.
        const int srcBase = row * MATMUL_MODULE(MatmulShapeInfo)->template GetBaseHeight<IS_TRANS>() * c0Size_ +
                            col * MATMUL_MODULE(MatmulShapeInfo)->template GetBaseWidth<IS_TRANS>() *
                                MATMUL_MODULE(MatmulShapeInfo)->template GetOrgHeight<IS_TRANS>();
        LocalTensor<TRANS_T> nzBuffer = MATMUL_MODULE(MatmulVar)
                                            ->GetLocalWorkspace(MATMUL_MODULE(MatmulVar)->GetNd2NzOffset() + tileSize)
                                            .template ReinterpretCast<TRANS_T>();
        nzBuffer.SetSize(tileSize);
        TransDataNZBMatrix<IS_TRANS>(nzBuffer, src[srcBase], tileHeight, tileWidth);
        // Synchronize V -> MTE3: the transpose must finish before the copy-out reads it.
        event_t evTransDone = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
        SetFlag<HardEvent::V_MTE3>(evTransDone);
        WaitFlag<HardEvent::V_MTE3>(evTransDone);
        CopyNZ2NZ(dst, nzBuffer, 0, 0, tileWidth, tileHeight, tileWidth);
    }

    // Loads an NZ tile from GM into workspace, transposes it with the vector unit,
    // and copies the result into `dst`. Extents are rounded up to c0 blocks.
    template <bool IS_TRANS = false>
    __aicore__ inline void CopyNZ2NZWithTransData(const LocalTensor<TRANS_T>& dst, GlobalTensor<SRC_T>& src,
                                                  const int row, const int col, const int tileHeight,
                                                  const int tileWidth)
    {
        int calcWidth = CeilT(tileWidth, c0Size_) * c0Size_;
        int calcHigh = CeilT(tileHeight, c0Size_) * c0Size_;
        int64_t size = calcHigh * calcWidth;
        // First workspace buffer: raw NZ columns loaded from GM.
        LocalTensor<TRANS_T> rightMatrix = MATMUL_MODULE(MatmulVar)
                                             ->GetLocalWorkspace(MATMUL_MODULE(MatmulVar)->GetNd2NzOffset())
                                             .template ReinterpretCast<TRANS_T>();
        rightMatrix.SetSize(size);
        // Tile start in the NZ source: row term steps by c0 blocks, column term
        // scales by the original matrix height.
        int srcOffset = row * MATMUL_MODULE(MatmulShapeInfo)->template GetBaseHeight<IS_TRANS>() * c0Size_ +
                        col * MATMUL_MODULE(MatmulShapeInfo)->template GetBaseWidth<IS_TRANS>() *
                            MATMUL_MODULE(MatmulShapeInfo)->template GetOrgHeight<IS_TRANS>();
        int dstOffset = 0;
        // Stride between consecutive fractal columns of the source matrix.
        // NOTE(review): the literal 16 here looks like BLOCK_CUBE — confirm and unify.
        int srcHigh = CeilT(MATMUL_MODULE(MatmulShapeInfo)->template GetOrgHeight<IS_TRANS>(), 16) * 16 * c0Size_;
        int dstHigh = tileHeight < c0Size_ ? tileHeight * c0Size_ : calcHigh * c0Size_;
        // Copy the tile one fractal column at a time.
        for (int i = 0; i < CeilT(tileWidth, c0Size_); i++) {
            DataCopy(rightMatrix[dstOffset], src[srcOffset], dstHigh);
            srcOffset += srcHigh;
            dstOffset += dstHigh;
        }
        // Second workspace buffer: transposed output, placed right after the first.
        LocalTensor<TRANS_T> trans = MATMUL_MODULE(MatmulVar)
                                         ->GetLocalWorkspace(MATMUL_MODULE(MatmulVar)->GetNd2NzOffset() + size)
                                         .template ReinterpretCast<TRANS_T>();
        trans.SetSize(size);
        // Synchronize MTE2 -> V: GM load must finish before the vector transpose reads it.
        event_t eventIDMte2ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V));
        SetFlag<HardEvent::MTE2_V>(eventIDMte2ToV);
        WaitFlag<HardEvent::MTE2_V>(eventIDMte2ToV);
        TransDataNZBMatrix<IS_TRANS>(trans, rightMatrix, tileHeight, tileWidth);
        // Synchronize V -> MTE3: transpose must finish before the copy-out reads it.
        event_t eventIDVToMte3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
        SetFlag<HardEvent::V_MTE3>(eventIDVToMte3);
        WaitFlag<HardEvent::V_MTE3>(eventIDVToMte3);
        CopyNZ2NZ(dst, trans, 0, 0, calcWidth, calcHigh, calcWidth);
    }

    // Transposes an ND (row-major) [height x width] matrix in UB into NZ layout using
    // TransDataTo5HD. Each c0 column is handled with four calls that combine the
    // dst/src HighHalf flags, stepping through 16-row address lists; a narrow tail
    // column (when present) is handled separately with only the two srcHighHalf=false/
    // dstHighHalf passes.
    __aicore__ inline void TransDataNDBMatrix(const LocalTensor<TRANS_T>& dst, const LocalTensor<SRC_T>& src,
                                              int height, int width)
    {
        int iterK = CeilT(height, c0Size_);  // repeats along the height (K) axis
        int iterN = CeilT(width, c0Size_);   // c0 columns along the width (N) axis
        int calcWidth = iterN * c0Size_;     // width rounded up to whole c0 blocks
        // NOTE(review): this mixes a modulus by c0Size_ with a modulus by 16
        // (TRANS_DATA_ARRAY_SIZE); confirm the intent when c0Size_ != 16 (b8 types).
        int tailWidth = (width % c0Size_) > TRANS_DATA_ARRAY_SIZE_MM_API ? 0 : width % TRANS_DATA_ARRAY_SIZE_MM_API;
        TransDataTo5HDParams params;
        params.repeatTimes = iterK;
        params.dstRepStride = iterK == 1 ? 0 : calcWidth;
        params.srcRepStride = iterK == 1 ? 0 : calcWidth;
        int dstHighHalfOffset = TRANS_DATA_ARRAY_SIZE_MM_API * c0Size_;   // dst offset of the upper 16 lanes
        int srcHighHalfOffset = TRANS_DATA_ARRAY_SIZE_MM_API * calcWidth; // src offset of the lower 16 rows
        iterN = tailWidth ? iterN - 1 : iterN; // last column goes through the tail path instead
        uint64_t dstLocalList[TRANS_DATA_ARRAY_SIZE_MM_API];
        uint64_t srcLocalList[TRANS_DATA_ARRAY_SIZE_MM_API];
        int dstOffset = 0;
        int srcOffset = 0;
        for (int curN = 0; curN < iterN; curN++) {
            // Pass 1: low source rows -> low destination half.
            int dstListOffset = 0;
            int srcListOffset = 0;
            for (int i = 0; i < TRANS_DATA_ARRAY_SIZE_MM_API; i++) {
                dstLocalList[i] = (uint64_t)(dst[dstOffset + dstListOffset].GetPhyAddr());
                srcLocalList[i] = (uint64_t)(src[srcOffset + srcListOffset].GetPhyAddr());
                dstListOffset += c0Size_;
                srcListOffset += calcWidth;
            }
            params.dstHighHalf = false;
            params.srcHighHalf = false;
            TransDataTo5HD<TRANS_T>(dstLocalList, srcLocalList, params);
            PipeBarrier<PIPE_V>();
            // Pass 2: rows offset by srcHighHalfOffset -> high destination half (same dst list).
            srcListOffset = 0;
            for (int i = 0; i < TRANS_DATA_ARRAY_SIZE_MM_API; i++) {
                srcLocalList[i] = (uint64_t)(src[srcOffset + srcListOffset + srcHighHalfOffset].GetPhyAddr());
                srcListOffset += calcWidth;
            }
            params.dstHighHalf = true;
            params.srcHighHalf = false;
            TransDataTo5HD<TRANS_T>(dstLocalList, srcLocalList, params);
            PipeBarrier<PIPE_V>();
            // Pass 3: low source rows, high source half -> dst list offset by dstHighHalfOffset.
            dstListOffset = 0;
            srcListOffset = 0;
            for (int i = 0; i < TRANS_DATA_ARRAY_SIZE_MM_API; i++) {
                dstLocalList[i] = (uint64_t)(dst[dstOffset + dstListOffset + dstHighHalfOffset].GetPhyAddr());
                srcLocalList[i] = (uint64_t)(src[srcOffset + srcListOffset].GetPhyAddr());
                dstListOffset += c0Size_;
                srcListOffset += calcWidth;
            }
            params.dstHighHalf = false;
            params.srcHighHalf = true;
            TransDataTo5HD<TRANS_T>(dstLocalList, srcLocalList, params);
            PipeBarrier<PIPE_V>();
            // Pass 4: offset source rows, high source half -> high destination half.
            srcListOffset = 0;
            for (int i = 0; i < TRANS_DATA_ARRAY_SIZE_MM_API; i++) {
                srcLocalList[i] = (uint64_t)(src[srcOffset + srcListOffset + srcHighHalfOffset].GetPhyAddr());
                srcListOffset += calcWidth;
            }
            params.dstHighHalf = true;
            params.srcHighHalf = true;
            TransDataTo5HD<TRANS_T>(dstLocalList, srcLocalList, params);
            PipeBarrier<PIPE_V>();
            // Advance to the next c0 column: dst moves a whole fractal, src one c0 block.
            dstOffset += c0Size_ * c0Size_;
            srcOffset += c0Size_;
        }
        if (tailWidth) {
            // Tail column narrower than a full c0 block: only two passes needed,
            // with a tighter destination repeat stride.
            dstOffset = iterN * c0Size_ * c0Size_;
            srcOffset = iterN * c0Size_;
            int dstListOffset = 0;
            int srcListOffset = 0;
            params.dstRepStride = iterK == 1 ? 0 : TRANS_DATA_ARRAY_SIZE_MM_API;
            for (int i = 0; i < TRANS_DATA_ARRAY_SIZE_MM_API; i++) {
                dstLocalList[i] = (uint64_t)(dst[dstOffset + dstListOffset].GetPhyAddr());
                srcLocalList[i] = (uint64_t)(src[srcOffset + srcListOffset].GetPhyAddr());
                dstListOffset += c0Size_;
                srcListOffset += calcWidth;
            }
            params.dstHighHalf = false;
            params.srcHighHalf = false;
            TransDataTo5HD<TRANS_T>(dstLocalList, srcLocalList, params);
            PipeBarrier<PIPE_V>();
            srcListOffset = 0;
            for (int i = 0; i < TRANS_DATA_ARRAY_SIZE_MM_API; i++) {
                srcLocalList[i] = (uint64_t)(src[srcOffset + srcListOffset + srcHighHalfOffset].GetPhyAddr());
                srcListOffset += calcWidth;
            }
            params.dstHighHalf = true;
            params.srcHighHalf = false;
            TransDataTo5HD<TRANS_T>(dstLocalList, srcLocalList, params);
            PipeBarrier<PIPE_V>();
        }
    }

    // Transposes an NZ-formatted [height x width] tile with TransDataTo5HD. For each
    // c0 column, four calls cover the combinations of dst/src HighHalf flags; the
    // source stride between columns is one full fractal column of the original matrix
    // (GetOrgHeight * c0Size_).
    template <bool IS_TRANS = false>
    __aicore__ inline void TransDataNZBMatrix(const LocalTensor<TRANS_T>& dst, const LocalTensor<SRC_T>& src,
                                              int height, int width)
    {
        int iterK = CeilT(height, c0Size_); // repeats along the height (K) axis
        int iterN = CeilT(width, c0Size_);  // c0 columns along the width (N) axis
        int calcWidth = iterN * c0Size_;
        int tailWidth = width % c0Size_;    // partial last column, if any
        TransDataTo5HDParams params;
        params.repeatTimes = iterK;
        params.dstRepStride = iterK == 1 ? 0 : calcWidth;
        params.srcRepStride = iterK == 1 ? 0 : c0Size_; // src is NZ: contiguous c0 blocks
        int dstHighHalfOffset = TRANS_DATA_ARRAY_SIZE_MM_API * c0Size_;
        int srcHighHalfOffset = TRANS_DATA_ARRAY_SIZE_MM_API * c0Size_;
        uint64_t dstLocalList[TRANS_DATA_ARRAY_SIZE_MM_API];
        uint64_t srcLocalList[TRANS_DATA_ARRAY_SIZE_MM_API];
        int dstOffset = 0;
        int srcOffset = 0;
        for (int curN = 0; curN < iterN; curN++) {
            // For a partial last column, shrink the destination repeat stride to the tail width.
            params.dstRepStride =
                (curN == iterN - 1 && tailWidth > 0 && tailWidth < c0Size_) ? tailWidth : params.dstRepStride;
            // Pass 1: high source half -> high destination half.
            int dstListOffset = 0;
            int srcListOffset = 0;
            for (int i = 0; i < TRANS_DATA_ARRAY_SIZE_MM_API; i++) {
                dstLocalList[i] = (uint64_t)(dst[dstOffset + dstListOffset + dstHighHalfOffset].GetPhyAddr());
                srcLocalList[i] = (uint64_t)(src[srcOffset + srcListOffset + srcHighHalfOffset].GetPhyAddr());
                dstListOffset += c0Size_;
                srcListOffset += c0Size_;
            }
            params.dstHighHalf = true;
            params.srcHighHalf = true;
            TransDataTo5HD<TRANS_T>(dstLocalList, srcLocalList, params);
            PipeBarrier<PIPE_V>();
            // Pass 2: high source half (base addresses) -> low destination half.
            srcListOffset = 0;
            for (int i = 0; i < TRANS_DATA_ARRAY_SIZE_MM_API; i++) {
                srcLocalList[i] = (uint64_t)(src[srcOffset + srcListOffset].GetPhyAddr());
                srcListOffset += c0Size_;
            }
            params.dstHighHalf = false;
            params.srcHighHalf = true;
            TransDataTo5HD<TRANS_T>(dstLocalList, srcLocalList, params);
            PipeBarrier<PIPE_V>();
            // Pass 3: low source half -> low destination half (dst list at base addresses).
            dstListOffset = 0;
            for (int i = 0; i < TRANS_DATA_ARRAY_SIZE_MM_API; i++) {
                dstLocalList[i] = (uint64_t)(dst[dstOffset + dstListOffset].GetPhyAddr());
                dstListOffset += c0Size_;
            }
            params.dstHighHalf = false;
            params.srcHighHalf = false;
            TransDataTo5HD<TRANS_T>(dstLocalList, srcLocalList, params);
            PipeBarrier<PIPE_V>();
            // Pass 4: low source half at the high-half source offset -> high destination half.
            srcListOffset = 0;
            for (int i = 0; i < TRANS_DATA_ARRAY_SIZE_MM_API; i++) {
                srcLocalList[i] = (uint64_t)(src[srcOffset + srcListOffset + srcHighHalfOffset].GetPhyAddr());
                srcListOffset += c0Size_;
            }
            params.dstHighHalf = true;
            params.srcHighHalf = false;
            TransDataTo5HD<TRANS_T>(dstLocalList, srcLocalList, params);
            PipeBarrier<PIPE_V>();
            // Next column: dst advances a fractal; src advances one fractal column
            // of the original matrix.
            dstOffset += c0Size_ * c0Size_;
            srcOffset +=
                MATMUL_MODULE(MatmulShapeInfo)->template GetOrgHeight<IS_TRANS>() * c0Size_;
        }
    }
    // Dequantizes the int8 input tile into TRANS_T via AscendAntiQuant (v200 path only;
    // a no-op elsewhere). Dispatches between the per-tensor scalar path and the
    // per-channel tensor path, and between transposed/non-transposed layouts.
    // quantOut       : destination tensor for the dequantized data.
    // quantIn        : quantized source tile.
    // isBankConflict : when true (non-transposed path only), the N extent of the
    //                  scale/offset tensors is padded by one extra 32B block.
    __aicore__ inline void AntiQuantCompute(const LocalTensor<TRANS_T>& quantOut, const LocalTensor<SRC_T>& quantIn,
                                            bool isBankConflict)
    {
#if __CCE_AICORE__ == 200
        LocalTensor<uint8_t> sharedLocal = GetSharedLocal();
        if constexpr (ToMatmulConfig(MM_CFG).isPerTensor) {
            AntiQuantComputePerTensor(quantOut, quantIn, sharedLocal);
        } else {
            uint32_t groupNum = 1;
            AntiQuantShapeInfo shapeInfo;
            const bool isTranspose = MATMUL_MODULE(MatmulShapeInfo)->IsTranspose();
            // N extent aligned to 32B; one consistent unsigned type for both layouts
            // (previously the non-transposed branch used a plain int).
            const uint32_t quantN = CeilAlignT<int32_t>(GetBaseUseN(), ANTI_QUANT_ALIGN_SIZE_MM_API);
            if (isTranspose) {
                // Transposed: scale/offset laid out as [N x groupNum].
                if constexpr (ToMatmulConfig(MM_CFG).hasAntiQuantOffset) {
                    shapeInfo.offsetHeight = quantN;
                    shapeInfo.offsetWidth = groupNum;
                }
                shapeInfo.scaleHeight = quantN;
                shapeInfo.scaleWidth = groupNum;
            } else {
                // Non-transposed: [groupNum x N], padded by one 32B block on bank conflict.
                const uint32_t padNSize = isBankConflict ? quantN + ANTI_QUANT_ALIGN_SIZE_MM_API : quantN;
                if constexpr (ToMatmulConfig(MM_CFG).hasAntiQuantOffset) {
                    shapeInfo.offsetHeight = groupNum;
                    shapeInfo.offsetWidth = padNSize;
                }
                shapeInfo.scaleHeight = groupNum;
                shapeInfo.scaleWidth = padNSize;
            }
            // K extent handed to AscendAntiQuant, aligned once instead of per call site.
            const int32_t kAligned = CeilAlignT<int32_t>(GetBaseUseStepKb(), ANTI_QUANT_ALIGN_SIZE_MM_API);
            if constexpr (ToMatmulConfig(MM_CFG).hasAntiQuantOffset) {
                if (isTranspose) {
                    AscendAntiQuant<SRC_T, TRANS_T, true>(
                        quantOut, quantIn, MATMUL_MODULE(MatmulAntiQuantProcessor)->GetAntiQuantOffsetTensor(),
                        MATMUL_MODULE(MatmulAntiQuantProcessor)->GetAntiQuantScaleTensor(), sharedLocal,
                        kAligned, shapeInfo);
                } else {
                    AscendAntiQuant<SRC_T, TRANS_T, false>(
                        quantOut, quantIn, MATMUL_MODULE(MatmulAntiQuantProcessor)->GetAntiQuantOffsetTensor(),
                        MATMUL_MODULE(MatmulAntiQuantProcessor)->GetAntiQuantScaleTensor(), sharedLocal,
                        kAligned, shapeInfo);
                }
            } else {
                if (isTranspose) {
                    AscendAntiQuant<SRC_T, TRANS_T, true>(
                        quantOut, quantIn, MATMUL_MODULE(MatmulAntiQuantProcessor)->GetAntiQuantScaleTensor(),
                        sharedLocal, kAligned, shapeInfo);
                } else {
                    AscendAntiQuant<SRC_T, TRANS_T, false>(
                        quantOut, quantIn, MATMUL_MODULE(MatmulAntiQuantProcessor)->GetAntiQuantScaleTensor(),
                        sharedLocal, kAligned, shapeInfo);
                }
            }
        }
#endif
    }

    // Per-tensor dequantization path (v200 only): every element shares one scale
    // (and optionally one offset) scalar. Selects the AscendAntiQuant overload by the
    // compile-time offset flag and the runtime transpose flag.
    __aicore__ inline void AntiQuantComputePerTensor(const LocalTensor<TRANS_T>& quantOut,
                                                     const LocalTensor<SRC_T>& quantIn,
                                                     const LocalTensor<uint8_t>& sharedLocal)
    {
#if __CCE_AICORE__ == 200
        // Aligned K extent is the same for all four call sites; compute it once.
        const int32_t kAligned = CeilAlignT<int32_t>(GetBaseUseStepKb(), ANTI_QUANT_ALIGN_SIZE_MM_API);
        const bool isTranspose = MATMUL_MODULE(MatmulShapeInfo)->IsTranspose();
        if constexpr (ToMatmulConfig(MM_CFG).hasAntiQuantOffset) {
            if (isTranspose) {
                AscendAntiQuant<SRC_T, TRANS_T, true>(
                    quantOut, quantIn, MATMUL_MODULE(MatmulAntiQuantProcessor)->GetAntiQuantOffsetScalar(),
                    MATMUL_MODULE(MatmulAntiQuantProcessor)->GetAntiQuantScaleScalar(), sharedLocal, kAligned);
            } else {
                AscendAntiQuant<SRC_T, TRANS_T, false>(
                    quantOut, quantIn, MATMUL_MODULE(MatmulAntiQuantProcessor)->GetAntiQuantOffsetScalar(),
                    MATMUL_MODULE(MatmulAntiQuantProcessor)->GetAntiQuantScaleScalar(), sharedLocal, kAligned);
            }
        } else {
            if (isTranspose) {
                AscendAntiQuant<SRC_T, TRANS_T, true>(
                    quantOut, quantIn, MATMUL_MODULE(MatmulAntiQuantProcessor)->GetAntiQuantScaleScalar(),
                    sharedLocal, kAligned);
            } else {
                AscendAntiQuant<SRC_T, TRANS_T, false>(
                    quantOut, quantIn, MATMUL_MODULE(MatmulAntiQuantProcessor)->GetAntiQuantScaleScalar(),
                    sharedLocal, kAligned);
            }
        }
#endif
    }

    // Returns a scratch UB region for AscendAntiQuant, carved from the tail of the
    // transLength workspace (offset transLength - tmpBuffSize - scaleUbSize).
    // NOTE(review): only populated when the matrix is transposed; in the
    // non-transposed path a default-constructed (empty) tensor is returned —
    // confirm callers handle that case.
    __aicore__ inline LocalTensor<uint8_t> GetSharedLocal()
    {
        LocalTensor<uint8_t> sharedLocal;
        if (MATMUL_MODULE(MatmulShapeInfo)->IsTranspose()) {
            // Scale buffer size in bytes; the literals 2 and 32 are presumably
            // sizeof(half) and one 32B block per tensor — TODO confirm and name them.
            int32_t scaleUbSize = MATMUL_MODULE(MatmulShapeInfo)->template GetBaseWidth() * 2 + 32 * 2;
            // Temp buffer: 16 rows of baseWidth TRANS_T elements, doubled — presumably
            // ping/pong halves; verify against AscendAntiQuant's requirement.
            int32_t tmpBuffSize = 16 * MATMUL_MODULE(MatmulShapeInfo)->template GetBaseWidth() * 2 * sizeof(TRANS_T);
            ASCENDC_ASSERT((MATMUL_MODULE(MatmulVar)->GetTransLength() > (tmpBuffSize + scaleUbSize)), {
                KERNEL_LOG(KERNEL_ERROR, "transLength(%d) must be larger than tmpBuffSize(%d) + scaleUbSize(%d)",
                           MATMUL_MODULE(MatmulVar)->GetTransLength(), tmpBuffSize, scaleUbSize);
            });
            int32_t tmpBuffOffset = MATMUL_MODULE(MatmulVar)->GetTransLength() - tmpBuffSize - scaleUbSize;
            sharedLocal =
                MATMUL_MODULE(MatmulVar)->GetLocalWorkspace(tmpBuffOffset).template ReinterpretCast<uint8_t>();
            sharedLocal.SetSize(tmpBuffSize);
        }
        return sharedLocal;
    }

private:
    // Elements per C0 block for TRANS_T, resolved at compile time via AuxGetC0Size.
    constexpr static int32_t c0Size_ = AuxGetC0Size<TRANS_T>();
    // Queue backing the UB-side input cache.
    typename CubeInQueType<INPUT_TYPE>::QUE qidUBCache_;
    LocalTensor<SRC_T> cacheHead2UB_; // Allocate and release using qidUBCache_
    // Progress counter for the cache-to-UB pipeline.
    int32_t cache2UBProc_ = 0;
};
} // namespace matmul
#endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_USING_UB_H
